def main():
    """Scrape a single bill (module/chamber/session/bill_id) to disk.

    Optionally imports the scraped bill afterwards when --import is given.
    """
    parser = argparse.ArgumentParser(
        description='Scrape data for single bill, saving data to disk.',
        parents=[base_arg_parser],
    )
    parser.add_argument('module', type=str, help='scraper module (eg. nc)')
    parser.add_argument('chamber', type=str, help='chamber for bill to scrape')
    parser.add_argument('session', type=str, help='session for bill to scrape')
    parser.add_argument('bill_id', type=str, help='bill_id to scrape')
    parser.add_argument('--strict', action='store_true', dest='strict',
                        default=False,
                        # fixed: the two concatenated help strings were
                        # missing a separating space
                        help='fail immediately when '
                             'encountering validation warning')
    parser.add_argument('-n', '--no_cache', action='store_true',
                        dest='no_cache', help="don't use web page cache")
    parser.add_argument('--fastmode', help='scrape in fast mode',
                        action='store_true', default=False)
    # fixed: a stray trailing comma here wrapped this call in a no-op tuple
    parser.add_argument('-r', '--rpm', action='store', type=int,
                        dest='rpm', default=60)
    parser.add_argument('--import', dest='do_import',
                        help='import bill after scrape',
                        action='store_true', default=False)

    args = parser.parse_args()
    settings.update(args)

    # set up search path so the scraper module can be imported
    sys.path.insert(0, os.path.join(os.path.dirname(__file__),
                                    '../../openstates'))

    # get metadata from the scraper module
    metadata = __import__(args.module, fromlist=['metadata']).metadata
    abbr = metadata['abbreviation']

    # configure logger
    configure_logging(args.verbose, abbr)

    args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbr)

    _run_scraper(args, metadata)

    if args.do_import:
        import_bills(abbr, settings.BILLY_DATA_DIR)
def main():
    """CLI entry point: import scraped data for one abbreviation into the DB.

    Requires at least one of --bills/--legislators/--committees/--events or
    --alldata; metadata is always imported regardless of flags.
    """
    parser = argparse.ArgumentParser(
        description='Import scraped data into database.',
        parents=[base_arg_parser],
    )
    parser.add_argument('abbreviation', type=str,
                        help='the short name of the data to import')
    parser.add_argument('-r', '--rpm', type=int, default=60,
                        help=('maximum number of documents to download '
                              'per minute'))
    parser.add_argument('--bills', action='store_true',
                        help='scrape bill data')
    parser.add_argument('--legislators', action='store_true',
                        help='scrape legislator data')
    parser.add_argument('--committees', action='store_true',
                        help='scrape (separate) committee data')
    parser.add_argument('--events', action='store_true',
                        help='scrape event data')
    parser.add_argument('--alldata', action='store_true', dest='alldata',
                        default=False, help="import all available data")

    args = parser.parse_args()

    anything_selected = (args.bills or args.legislators or args.committees or
                         args.events or args.alldata)
    if not anything_selected:
        raise Exception("Must specify at least one type: --bills, "
                        "--legislators, --committees, --events, "
                        "--alldata")

    settings.update(args)
    data_dir = settings.BILLY_DATA_DIR
    abbr = args.abbreviation

    # configure logger
    configure_logging(args.verbose, abbr)

    # metadata is unconditionally (re)imported
    import_metadata(abbr, data_dir)

    if args.legislators or args.alldata:
        import_legislators(abbr, data_dir)
    if args.bills or args.alldata:
        import_bills(abbr, data_dir)
    if args.committees or args.alldata:
        import_committees(abbr, data_dir)
    # events are deliberately excluded from --alldata
    if args.events:
        import_events(abbr, data_dir)
def main():
    """Scrape a single bill for a state scraper module, saving data to disk.

    Optionally imports the scraped bill afterwards when --import is given.
    """
    parser = argparse.ArgumentParser(
        description='Scrape data for state, saving data to disk.',
        parents=[base_arg_parser],
    )
    parser.add_argument('state', type=str,
                        help='state scraper module (eg. nc)')
    parser.add_argument('chamber', type=str, help='chamber for bill to scrape')
    parser.add_argument('session', type=str, help='session for bill to scrape')
    parser.add_argument('bill_id', type=str, help='bill_id to scrape')
    parser.add_argument('--strict', action='store_true', dest='strict',
                        default=False,
                        # fixed: the two concatenated help strings were
                        # missing a separating space
                        help='fail immediately when '
                             'encountering validation warning')
    parser.add_argument('-n', '--no_cache', action='store_true',
                        dest='no_cache', help="don't use web page cache")
    parser.add_argument('--fastmode', help='scrape in fast mode',
                        action='store_true', default=False)
    # fixed: a stray trailing comma here wrapped this call in a no-op tuple
    parser.add_argument('-r', '--rpm', action='store', type=int,
                        dest='rpm', default=60)
    parser.add_argument('--import', dest='do_import',
                        help='import bill after scrape',
                        action='store_true', default=False)

    args = parser.parse_args()
    settings.update(args)

    # set up search path so the state scraper module can be imported
    sys.path.insert(0, os.path.join(os.path.dirname(__file__),
                                    '../../openstates'))

    # get metadata from the state scraper module
    metadata = __import__(args.state, fromlist=['metadata']).metadata
    state = metadata['abbreviation']

    # configure logger
    configure_logging(args.verbose, state)

    args.output_dir = os.path.join(settings.BILLY_DATA_DIR, args.state)

    _run_scraper(args.state, state, args, metadata)

    if args.do_import:
        import_bills(args.state, settings.BILLY_DATA_DIR)
def main():
    """Dump API data for one or more abbreviations to zip archives.

    Optionally uploads each archive to S3 when --upload is given.
    """
    import argparse
    configure_logging(1)

    parser = argparse.ArgumentParser(
        description=('Dump API information to a zipped directory of JSON files'
                     ', optionally uploading to S3 when done.'),
        parents=[base_arg_parser],
    )
    parser.add_argument(
        'abbrs', metavar='ABBR', type=str, nargs='+',
        help='the two-letter abbreviation for the data to export')
    parser.add_argument('--file', '-f',
                        help='filename to output to (defaults to <abbr>.zip)')
    parser.add_argument('--schema_dir', default=None,
                        help='directory to use for API schemas (optional)')
    parser.add_argument('--nodump', action='store_true', default=False,
                        help="don't run the dump, only upload")
    parser.add_argument('--novalidate', action='store_true', default=False,
                        help="don't run validation")
    parser.add_argument('--upload', '-u', action='store_true', default=False,
                        help='upload the created archive to S3')

    args = parser.parse_args()
    settings.update(args)

    for abbr in args.abbrs:
        # fixed: args.file used to be mutated on the first iteration, so with
        # several abbrs and no --file every later abbr clobbered the first
        # abbr's "<abbr>.zip" archive; compute the filename per-abbr instead
        filename = args.file or (abbr + '.zip')
        if not args.nodump:
            dump_json(abbr, filename, not args.novalidate, args.schema_dir)
        if args.upload:
            upload(abbr, filename)
def main():
    """Scrape a single bill (module/chamber/session/bill_id) to disk.

    Exits with status 1, printing the error, if a ScrapeError occurs.
    """
    try:
        parser = argparse.ArgumentParser(
            description="Scrape data for single bill, saving data to disk.",
            parents=[base_arg_parser]
        )
        parser.add_argument("module", type=str, help="scraper module (eg. nc)")
        parser.add_argument("chamber", type=str,
                            help="chamber for bill to scrape")
        parser.add_argument("session", type=str,
                            help="session for bill to scrape")
        parser.add_argument("bill_id", type=str, help="bill_id to scrape")
        parser.add_argument(
            "--strict", action="store_true", dest="strict", default=False,
            # fixed: the two concatenated help strings were missing a
            # separating space
            help="fail immediately when "
                 "encountering validation warning",
        )
        parser.add_argument("-n", "--no_cache", action="store_true",
                            dest="no_cache", help="don't use web page cache")
        parser.add_argument("--fastmode", help="scrape in fast mode",
                            action="store_true", default=False)
        # fixed: a stray trailing comma here wrapped this call in a no-op
        # tuple
        parser.add_argument("-r", "--rpm", action="store", type=int,
                            dest="rpm", default=60)
        parser.add_argument(
            "--import", dest="do_import", help="import bill after scrape",
            action="store_true", default=False
        )

        args = parser.parse_args()
        settings.update(args)

        # set up search path so the scraper module can be imported
        sys.path.insert(0, os.path.join(os.path.dirname(__file__),
                                        "../../openstates"))

        # get metadata from the scraper module
        metadata = __import__(args.module, fromlist=["metadata"]).metadata
        abbr = metadata["abbreviation"]

        # configure logger
        configure_logging(args.verbose, abbr)

        args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbr)

        _run_scraper(args, metadata)

        if args.do_import:
            import_bills(abbr, settings.BILLY_DATA_DIR)
    except ScrapeError as e:
        # fixed: the Py2-only print statement is replaced with a form that
        # produces identical output and also works under print_function
        print('Error: %s' % e)
        sys.exit(1)
def main():
    """Convert scraped state bill files into SFM-ready text."""
    parser = argparse.ArgumentParser(
        description='Convert state bills to SFM-ready text',
        parents=[base_arg_parser],
    )
    parser.add_argument('state', type=str, help='state')
    parser.add_argument('--sfm_server', type=str,
                        default='http://localhost:8080/',
                        help='URL of SFM instance')

    args = parser.parse_args()
    settings.update(args)
    configure_logging(args.verbose, args.state)

    process_state_files(args.state, args.sfm_server)
def main():
    """Scrape a single bill (module/chamber/session/bill_id) to disk.

    Exits with status 1, printing the error, if a ScrapeError occurs.
    """
    try:
        parser = argparse.ArgumentParser(
            description='Scrape data for single bill, saving data to disk.',
            parents=[base_arg_parser],
        )
        parser.add_argument('module', type=str, help='scraper module (eg. nc)')
        parser.add_argument('chamber', type=str,
                            help='chamber for bill to scrape')
        parser.add_argument('session', type=str,
                            help='session for bill to scrape')
        parser.add_argument('bill_id', type=str, help='bill_id to scrape')
        parser.add_argument('--strict', action='store_true', dest='strict',
                            default=False,
                            # fixed: the two concatenated help strings were
                            # missing a separating space
                            help='fail immediately when '
                                 'encountering validation warning')
        parser.add_argument('-n', '--no_cache', action='store_true',
                            dest='no_cache', help="don't use web page cache")
        parser.add_argument('--fastmode', help='scrape in fast mode',
                            action='store_true', default=False)
        # fixed: a stray trailing comma here wrapped this call in a no-op
        # tuple
        parser.add_argument('-r', '--rpm', action='store', type=int,
                            dest='rpm', default=60)
        parser.add_argument('--import', dest='do_import',
                            help='import bill after scrape',
                            action='store_true', default=False)

        args = parser.parse_args()
        settings.update(args)

        # get metadata from the scraper module
        metadata = __import__(args.module, fromlist=['metadata']).metadata
        abbr = metadata['abbreviation']

        # configure logger
        configure_logging(args.verbose, abbr)

        args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbr)

        _run_scraper(args, metadata)

        if args.do_import:
            import_bills(abbr, settings.BILLY_DATA_DIR)
    except ScrapeError as e:
        # fixed: the Py2-only print statement is replaced with a form that
        # produces identical output and also works under print_function
        print('Error: %s' % e)
        sys.exit(1)
def main():
    """Dispatch to a billy subcommand selected on the command line."""
    parser = argparse.ArgumentParser(description='generic billy util',
                                     parents=[base_arg_parser])
    subparsers = parser.add_subparsers(dest='subcommand')

    # pull in every command plugin module so its subcommand registers itself
    for module_name in COMMAND_MODULES:
        import_command_module(module_name)

    # instantiate each registered BaseCommand subclass against the subparsers
    registry = {}
    for command_cls in BaseCommand.subcommands:
        registry[command_cls.name] = command_cls(subparsers)

    # parse arguments, update settings, then run the chosen subcommand
    args = parser.parse_args()
    settings.update(args)
    configure_logging(args.subcommand)
    registry[args.subcommand].handle(args)
def main():
    """Dump API data for one or more abbreviations to zip archives.

    Optionally uploads each archive to S3 when --upload is given.
    """
    import argparse
    configure_logging(1)

    parser = argparse.ArgumentParser(
        description=('Dump API information to a zipped directory of JSON files'
                     ', optionally uploading to S3 when done.'),
        parents=[base_arg_parser],
    )
    parser.add_argument('abbrs', metavar='ABBR', type=str, nargs='+',
                        help='the two-letter abbreviation for the data '
                             'to export')
    parser.add_argument('--file', '-f',
                        help='filename to output to (defaults to <abbr>.zip)')
    parser.add_argument('--schema_dir', default=None,
                        help='directory to use for API schemas (optional)')
    parser.add_argument('--nodump', action='store_true', default=False,
                        help="don't run the dump, only upload")
    parser.add_argument('--novalidate', action='store_true', default=False,
                        help="don't run validation")
    parser.add_argument('--upload', '-u', action='store_true', default=False,
                        help='upload the created archive to S3')

    args = parser.parse_args()
    settings.update(args)

    for abbr in args.abbrs:
        # fixed: args.file used to be mutated on the first iteration, so with
        # several abbrs and no --file every later abbr clobbered the first
        # abbr's "<abbr>.zip" archive; compute the filename per-abbr instead
        filename = args.file or (abbr + '.zip')
        if not args.nodump:
            dump_json(abbr, filename, not args.novalidate, args.schema_dir)
        if args.upload:
            upload(abbr, filename)
def main():
    """Scrape data for a state, writing JSON output to disk.

    Parses command line options, imports the state scraper module's metadata,
    validates and writes that metadata, then runs the selected scrapers
    (bills/legislators/committees/votes/events).
    """
    parser = argparse.ArgumentParser(
        description='Scrape data for state, saving data to disk.',
        parents=[base_arg_parser],
    )
    parser.add_argument('state', type=str,
                        help='state scraper module (eg. nc)')
    parser.add_argument('-s', '--session', action='append', dest='sessions',
                        help='session(s) to scrape')
    parser.add_argument('-t', '--term', action='append', dest='terms',
                        help='term(s) to scrape')
    parser.add_argument('--upper', action='store_true', dest='upper',
                        default=False, help='scrape upper chamber')
    parser.add_argument('--lower', action='store_true', dest='lower',
                        default=False, help='scrape lower chamber')
    parser.add_argument('--bills', action='store_true', dest='bills',
                        default=False, help='scrape bill data')
    parser.add_argument('--legislators', action='store_true',
                        dest='legislators', default=False,
                        help='scrape legislator data')
    parser.add_argument('--committees', action='store_true', dest='committees',
                        default=False, help='scrape committee data')
    parser.add_argument('--votes', action='store_true', dest='votes',
                        default=False, help='scrape vote data')
    parser.add_argument('--events', action='store_true', dest='events',
                        default=False, help='scrape event data')
    parser.add_argument('--alldata', action='store_true', dest='alldata',
                        default=False,
                        help='scrape all available types of data')
    parser.add_argument('--strict', action='store_true', dest='strict',
                        default=False,
                        # fixed: the two concatenated help strings were
                        # missing a separating space
                        help='fail immediately when '
                             'encountering validation warning')
    parser.add_argument('-n', '--no_cache', action='store_true',
                        dest='no_cache', help="don't use web page cache")
    parser.add_argument('--fastmode', help='scrape in fast mode',
                        action='store_true', default=False)
    # fixed: a stray trailing comma here wrapped this call in a no-op tuple
    parser.add_argument('-r', '--rpm', action='store', type=int,
                        dest='rpm', default=60)
    parser.add_argument('--timeout', action='store', type=int, dest='timeout',
                        default=10)

    args = parser.parse_args()
    settings.update(args)

    # set up search path so the state scraper package is importable
    sys.path.insert(0, os.path.join(os.path.dirname(__file__),
                                    '../../openstates'))

    # get metadata from the state scraper module
    metadata = __import__(args.state, fromlist=['metadata']).metadata
    state = metadata['abbreviation']

    configure_logging(args.verbose, args.state)

    # make output dir (an already-existing directory is fine)
    args.output_dir = os.path.join(settings.BILLY_DATA_DIR, args.state)
    try:
        os.makedirs(args.output_dir)
    except OSError as e:
        if e.errno != 17:  # 17 == errno.EEXIST
            raise          # bare raise preserves the original traceback

    # validate metadata against the schema, then write it out
    try:
        schema_path = os.path.join(os.path.split(__file__)[0],
                                   '../schemas/metadata.json')
        # fixed: schema file is now closed deterministically
        with open(schema_path) as schema_file:
            schema = json.load(schema_file)
        validator = DatetimeValidator()
        validator.validate(metadata, schema)
    except ValueError as e:
        logging.getLogger('billy').warning('metadata validation error: ' +
                                           str(e))
    with open(os.path.join(args.output_dir, 'state_metadata.json'), 'w') as f:
        json.dump(metadata, f, cls=JSONDateEncoder)

    # determine time period to run for
    if args.terms:
        if args.sessions is None:
            # fixed: sessions defaults to None (argparse append action), so
            # extend() used to raise AttributeError when only --term was given
            args.sessions = []
        for term in metadata['terms']:
            # fixed: the original compared the term dict itself against the
            # list of term *names*, which could never match
            if term['name'] in args.terms:
                args.sessions.extend(term['sessions'])
    args.sessions = set(args.sessions or [])

    # determine chambers (default: both)
    args.chambers = []
    if args.upper:
        args.chambers.append('upper')
    if args.lower:
        args.chambers.append('lower')
    if not args.chambers:
        args.chambers = ['upper', 'lower']

    if not (args.bills or args.legislators or args.votes or args.committees
            or args.events or args.alldata):
        raise ScrapeError("Must specify at least one of --bills, "
                          "--legislators, --committees, --votes, --events, "
                          "--alldata")

    # --alldata implies everything except events
    if args.alldata:
        args.bills = True
        args.legislators = True
        args.votes = True
        args.committees = True

    if args.bills:
        _run_scraper(args.state, state, 'bills', args, metadata)
    if args.legislators:
        _run_scraper(args.state, state, 'legislators', args, metadata)
    if args.committees:
        _run_scraper(args.state, state, 'committees', args, metadata)
    if args.votes:
        _run_scraper(args.state, state, 'votes', args, metadata)
    if args.events:
        _run_scraper(args.state, state, 'events', args, metadata)
help="import all available data") args = parser.parse_args() if not (args.bills or args.legislators or args.committees or args.events or args.alldata): raise Exception("Must specify at least one type: --bills, " "--legislators, --committees, --events, " "--alldata") settings.update(args) data_dir = settings.BILLY_DATA_DIR # configure logger configure_logging(args.verbose, args.abbreviation) # always import metadata import_metadata(args.abbreviation, data_dir) if args.legislators or args.alldata: import_legislators(args.abbreviation, data_dir) if args.bills or args.alldata: import_bills(args.abbreviation, data_dir) if args.committees or args.alldata: import_committees(args.abbreviation, data_dir) # events currently excluded from --alldata if args.events: import_events(args.abbreviation, data_dir)
k.key = s3_path logging.info('beginning upload to %s' % s3_url) k.set_contents_from_filename(filename) k.set_acl('public-read') meta['latest_dump_url'] = s3_url meta['latest_dump_date'] = datetime.datetime.utcnow() db.metadata.save(meta, safe=True) logging.info('upload complete') if __name__ == '__main__': import argparse configure_logging(1) parser = argparse.ArgumentParser( description=('Dump API information to a zipped directory of JSON files' ', optionally uploading to S3 when done.'), parents=[base_arg_parser], ) parser.add_argument( 'abbrs', metavar='ABBR', type=str, nargs='+', help=('the two-letter abbreviation for the data to export')) parser.add_argument('--file', '-f', help='filename to output to (defaults to <abbr>.zip)')
def main(old_scrape_compat=False):
    """billy-update entry point: scrape, import, and report on state data.

    old_scrape_compat: when True and no --scrape/--import/--report flag is
    given, only the scrape step runs (legacy behavior); otherwise all three
    steps run by default.
    """
    try:
        parser = argparse.ArgumentParser(
            description='update billy data',
            parents=[base_arg_parser],
        )

        # argument groups: "what" selects the data, "scrape" tunes the scraper
        what = parser.add_argument_group(
            'what to scrape', 'flags that help select what data to scrape')
        scrape = parser.add_argument_group('scraper config',
                                           'settings for the scraper')
        parser.add_argument('module', type=str, help='scraper module (eg. nc)')
        what.add_argument('-s', '--session', action='append',
                          dest='sessions', default=[],
                          help='session(s) to scrape')
        what.add_argument('-t', '--term', action='append', dest='terms',
                          help='term(s) to scrape', default=[])
        for arg in ('upper', 'lower'):
            what.add_argument('--' + arg, action='append_const',
                              dest='chambers', const=arg)
        for arg in ('bills', 'legislators', 'committees', 'votes', 'events'):
            what.add_argument('--' + arg, action='append_const', dest='types',
                              const=arg)
        for arg in ('scrape', 'import', 'report'):
            parser.add_argument('--' + arg, dest='actions',
                                action="append_const", const=arg,
                                help='only run %s step' % arg)

        # special modes for debugging
        scrape.add_argument('--nonstrict', action='store_false', dest='strict',
                            default=True, help="don't fail immediately when"
                            " encountering validation warning")
        scrape.add_argument('--fastmode', help="scrape in fast mode",
                            action="store_true", default=False)

        # scrapelib overrides; dest names map directly onto settings keys
        scrape.add_argument('-r', '--rpm', action='store', type=int,
                            dest='SCRAPELIB_RPM')
        scrape.add_argument('--timeout', action='store', type=int,
                            dest='SCRAPELIB_TIMEOUT')
        scrape.add_argument('--retries', type=int,
                            dest='SCRAPELIB_RETRY_ATTEMPTS')
        scrape.add_argument('--retry_wait', type=int,
                            dest='SCRAPELIB_RETRY_WAIT_SECONDS')

        args = parser.parse_args()

        # inject scraper paths so scraper module can be found
        for newpath in settings.SCRAPER_PATHS:
            sys.path.insert(0, newpath)

        # get metadata from the scraper module
        module = __import__(args.module)
        metadata = module.metadata
        module_settings = getattr(module, 'settings', {})
        abbrev = metadata['abbreviation']

        # load state settings, then command line settings (later wins)
        settings.update(module_settings)
        settings.update(args)

        configure_logging(args.module)

        # configure oyster document tracking when enabled in settings
        if settings.ENABLE_OYSTER:
            from oyster.conf import settings as oyster_settings
            oyster_settings.DOCUMENT_CLASSES[
                args.module + ':billtext'] = module.document_class

        # make output dir, clearing any previously scraped data
        args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbrev)
        _clear_scraped_data(args.output_dir)

        # if terms aren't set, derive them from sessions, else use latest term
        if not args.terms:
            if args.sessions:
                for session in args.sessions:
                    args.terms.append(
                        term_for_session(metadata['abbreviation'], session,
                                         metadata))
                args.terms = list(set(args.terms or []))
            else:
                latest_term = metadata['terms'][-1]['name']
                args.terms = [latest_term]
        # only set sessions from terms if sessions weren't set
        elif not args.sessions:
            for term in metadata['terms']:
                if term['name'] in args.terms:
                    args.sessions.extend(term['sessions'])

        # dedup sessions, falling back to the most recent session
        args.sessions = list(set(args.sessions or []))
        if not args.sessions:
            args.sessions = [metadata['terms'][-1]['sessions'][-1]]

        # determine chambers (default: both)
        if not args.chambers:
            args.chambers = ['upper', 'lower']

        # default actions: scrape only in legacy mode, else full pipeline
        if not args.actions:
            if old_scrape_compat:
                args.actions = ['scrape']
            else:
                args.actions = ['scrape', 'import', 'report']

        # default types; events only when the state's feature flags allow it
        if not args.types:
            args.types = [
                'bills', 'legislators', 'votes', 'committees', 'alldata'
            ]
            if 'events' in metadata['feature_flags']:
                args.types.append('events')

        # log the execution plan before doing any work
        plan = """billy-update abbr=%s actions=%s types=%s sessions=%s terms=%s""" % (args.module, ','.join(args.actions), ','.join(
            args.types), ','.join(args.sessions), ','.join(args.terms))
        logging.getLogger('billy').info(plan)

        scrape_data = {}

        if 'scrape' in args.actions:
            # validate then write metadata
            if hasattr(module, 'session_list'):
                session_list = module.session_list()
            else:
                session_list = []
            check_sessions(metadata, session_list)
            try:
                schema_path = os.path.join(
                    os.path.split(__file__)[0], '../schemas/metadata.json')
                schema = json.load(open(schema_path))
                validator = DatetimeValidator()
                validator.validate(metadata, schema)
            except ValueError as e:
                logging.getLogger('billy').warning(
                    'metadata validation error: ' + str(e))

            with open(os.path.join(args.output_dir, 'metadata.json'),
                      'w') as f:
                json.dump(metadata, f, cls=JSONDateEncoder)

            # run_record accumulates per-scraper results and any exceptions
            run_record = []
            exec_record = {
                "run_record": run_record,
                "args": sys.argv,
                "state": abbrev
            }

            lex = None            # last exception raised by a scraper
            exc_traceback = None  # its traceback, kept for re-raising

            # start to run scrapers
            exec_start = dt.datetime.utcnow()

            # scraper order matters
            order = ('legislators', 'committees', 'votes', 'bills', 'events')
            try:
                for stype in order:
                    if stype in args.types:
                        run_record += _run_scraper(stype, args, metadata)
            except Exception as e:
                _traceback = _, _, exc_traceback = sys.exc_info()
                run_record += [{"exception": e, "type": stype}]
                lex = e

            exec_end = dt.datetime.utcnow()
            exec_record['started'] = exec_start
            exec_record['ended'] = exec_end
            scrape_data['scraped'] = exec_record
            scrape_data['state'] = abbrev

            # serialize any captured exception into the run record
            for record in run_record:
                if "exception" in record:
                    ex = record['exception']
                    fb = traceback.format_exception(*_traceback)
                    trace = ""
                    for t in fb:
                        trace += t
                    record['exception'] = {
                        "type": ex.__class__.__name__,
                        "message": ex.message,
                        'traceback': trace
                    }
                    scrape_data['failure'] = True

            if lex:
                if 'import' in args.actions:
                    try:
                        db.billy_runs.save(scrape_data, safe=True)
                    except Exception:
                        raise lex, None, exc_traceback
                        # XXX: This should *NEVER* happen, but it has
                        # in the past, so we're going to catch any errors
                        # writing to pymongo, and raise the original
                        # exception rather then let it look like Mongo's
                        # fault. Thanks for catching this, Thom.
                        #
                        # We lose the stack trace, but the Exception is the
                        # same in every other way.
                        #  -- paultag
                raise

        # imports
        if 'import' in args.actions:
            import_report = _do_imports(abbrev, args)
            scrape_data['imported'] = import_report
            # We're tying the run-logging into the import stage - since import
            # already writes to the DB, we might as well throw this in too.
            db.billy_runs.save(scrape_data, safe=True)

        # reports
        if 'report' in args.actions:
            _do_reports(abbrev, args)
    except ScrapeError as e:
        print 'Error:', e
        sys.exit(1)
def main(old_scrape_compat=False):
    """Run the billy data pipeline for one scraper module.

    Steps (selectable via --scrape/--import/--report): scrape data to disk,
    import it into the database, and generate reports. With
    old_scrape_compat=True and no explicit action flags, only scraping runs.
    """
    try:
        parser = argparse.ArgumentParser(
            description='update billy data',
            parents=[base_arg_parser],
        )

        # "what" group selects the data; "scrape" group tunes the scraper
        what = parser.add_argument_group('what to scrape',
                                         'flags that help select what data '
                                         'to scrape')
        scrape = parser.add_argument_group('scraper config',
                                           'settings for the scraper')
        parser.add_argument('module', type=str, help='scraper module (eg. nc)')
        what.add_argument('-s', '--session', action='append',
                          dest='sessions', default=[],
                          help='session(s) to scrape')
        what.add_argument('-t', '--term', action='append', dest='terms',
                          help='term(s) to scrape', default=[])
        for arg in ('upper', 'lower'):
            what.add_argument('--' + arg, action='append_const',
                              dest='chambers', const=arg)
        for arg in ('bills', 'legislators', 'committees', 'votes', 'events'):
            what.add_argument('--' + arg, action='append_const', dest='types',
                              const=arg)
        for arg in ('scrape', 'import', 'report'):
            parser.add_argument('--' + arg, dest='actions',
                                action="append_const", const=arg,
                                help='only run %s step' % arg)

        # special modes for debugging
        scrape.add_argument('--nonstrict', action='store_false', dest='strict',
                            default=True, help="don't fail immediately when"
                            " encountering validation warning")
        scrape.add_argument('--fastmode', help="scrape in fast mode",
                            action="store_true", default=False)

        # scrapelib overrides; dest names map directly onto settings keys
        scrape.add_argument('-r', '--rpm', action='store', type=int,
                            dest='SCRAPELIB_RPM')
        scrape.add_argument('--timeout', action='store', type=int,
                            dest='SCRAPELIB_TIMEOUT')
        scrape.add_argument('--retries', type=int,
                            dest='SCRAPELIB_RETRY_ATTEMPTS')
        scrape.add_argument('--retry_wait', type=int,
                            dest='SCRAPELIB_RETRY_WAIT_SECONDS')

        args = parser.parse_args()

        # inject scraper paths so scraper module can be found
        for newpath in settings.SCRAPER_PATHS:
            sys.path.insert(0, newpath)

        # get metadata from the scraper module
        module = __import__(args.module)
        metadata = module.metadata
        module_settings = getattr(module, 'settings', {})
        abbrev = metadata['abbreviation']

        # load state settings, then command line settings (later wins)
        settings.update(module_settings)
        settings.update(args)

        configure_logging(args.module)

        # configure oyster document tracking when enabled in settings
        if settings.ENABLE_OYSTER:
            from oyster.conf import settings as oyster_settings
            oyster_settings.DOCUMENT_CLASSES[args.module + ':billtext'] = \
                module.document_class

        # make output dir, clearing any previously scraped data
        args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbrev)
        _clear_scraped_data(args.output_dir)

        # if terms aren't set, derive them from sessions, else use latest term
        if not args.terms:
            if args.sessions:
                for session in args.sessions:
                    args.terms.append(
                        term_for_session(metadata['abbreviation'], session,
                                         metadata))
                args.terms = list(set(args.terms or []))
            else:
                latest_term = metadata['terms'][-1]['name']
                args.terms = [latest_term]
        # only set sessions from terms if sessions weren't set
        elif not args.sessions:
            for term in metadata['terms']:
                if term['name'] in args.terms:
                    args.sessions.extend(term['sessions'])

        # dedup sessions, falling back to the most recent session
        args.sessions = list(set(args.sessions or []))
        if not args.sessions:
            args.sessions = [metadata['terms'][-1]['sessions'][-1]]

        # determine chambers (default: both)
        if not args.chambers:
            args.chambers = ['upper', 'lower']

        # default actions: scrape only in legacy mode, else full pipeline
        if not args.actions:
            if old_scrape_compat:
                args.actions = ['scrape']
            else:
                args.actions = ['scrape', 'import', 'report']

        # default types; events only when the state's feature flags allow it
        if not args.types:
            args.types = ['bills', 'legislators', 'votes', 'committees',
                          'alldata']
            if 'events' in metadata['feature_flags']:
                args.types.append('events')

        # log the execution plan before doing any work
        plan = """billy-update abbr=%s actions=%s types=%s sessions=%s terms=%s""" % (args.module, ','.join(args.actions),
                                                                                     ','.join(args.types),
                                                                                     ','.join(args.sessions),
                                                                                     ','.join(args.terms))
        logging.getLogger('billy').info(plan)

        scrape_data = {}

        if 'scrape' in args.actions:
            # validate then write metadata
            if hasattr(module, 'session_list'):
                session_list = module.session_list()
            else:
                session_list = []
            check_sessions(metadata, session_list)
            try:
                schema_path = os.path.join(os.path.split(__file__)[0],
                                           '../schemas/metadata.json')
                schema = json.load(open(schema_path))
                validator = DatetimeValidator()
                validator.validate(metadata, schema)
            except ValueError as e:
                logging.getLogger('billy').warning(
                    'metadata validation error: ' + str(e))

            with open(os.path.join(args.output_dir, 'metadata.json'),
                      'w') as f:
                json.dump(metadata, f, cls=JSONDateEncoder)

            # run_record accumulates per-scraper results and any exceptions
            run_record = []
            exec_record = {
                "run_record": run_record,
                "args": sys.argv,
                "state": abbrev
            }

            lex = None            # last exception raised by a scraper
            exc_traceback = None  # its traceback, kept for re-raising

            # start to run scrapers
            exec_start = dt.datetime.utcnow()

            # scraper order matters
            order = ('legislators', 'committees', 'votes', 'bills', 'events')
            try:
                for stype in order:
                    if stype in args.types:
                        run_record += _run_scraper(stype, args, metadata)
            except Exception as e:
                _traceback = _, _, exc_traceback = sys.exc_info()
                run_record += [{"exception": e, "type": stype }]
                lex = e

            exec_end = dt.datetime.utcnow()
            exec_record['started'] = exec_start
            exec_record['ended'] = exec_end
            scrape_data['scraped'] = exec_record
            scrape_data['state'] = abbrev

            # serialize any captured exception into the run record
            for record in run_record:
                if "exception" in record:
                    ex = record['exception']
                    fb = traceback.format_exception(*_traceback)
                    trace = ""
                    for t in fb:
                        trace += t
                    record['exception'] = {
                        "type": ex.__class__.__name__,
                        "message": ex.message,
                        'traceback': trace
                    }
                    scrape_data['failure'] = True

            if lex:
                if 'import' in args.actions:
                    try:
                        db.billy_runs.save(scrape_data, safe=True)
                    except Exception:
                        raise lex, None, exc_traceback
                        # XXX: This should *NEVER* happen, but it has
                        # in the past, so we're going to catch any errors
                        # writing to pymongo, and raise the original
                        # exception rather then let it look like Mongo's
                        # fault. Thanks for catching this, Thom.
                        #
                        # We lose the stack trace, but the Exception is the
                        # same in every other way.
                        #  -- paultag
                raise

        # imports
        if 'import' in args.actions:
            import_report = _do_imports(abbrev, args)
            scrape_data['imported'] = import_report
            # We're tying the run-logging into the import stage - since import
            # already writes to the DB, we might as well throw this in too.
            db.billy_runs.save(scrape_data, safe=True)

        # reports
        if 'report' in args.actions:
            _do_reports(abbrev, args)
    except ScrapeError as e:
        print 'Error:', e
        sys.exit(1)
default=False, help="import all available data") args = parser.parse_args() if not (args.bills or args.legislators or args.committees or args.events or args.alldata): raise Exception("Must specify at least one type: --bills, " "--legislators, --committees, --events, " "--alldata") settings.update(args) data_dir = settings.BILLY_DATA_DIR # configure logger configure_logging(args.verbose, args.abbreviation) # always import metadata import_metadata(args.abbreviation, data_dir) if args.legislators or args.alldata: import_legislators(args.abbreviation, data_dir) if args.bills or args.alldata: import_bills(args.abbreviation, data_dir) if args.committees or args.alldata: import_committees(args.abbreviation, data_dir) # events currently excluded from --alldata if args.events: import_events(args.abbreviation, data_dir)
#!/usr/bin/env python from __future__ import print_function from billy.utils import configure_logging, term_for_session # we need this before the command line args are read in configure_logging("startup") import logging _log = logging.getLogger('billy') import importlib #import bson.binary #from bson.binary import ALL_UUID_SUBTYPES #from bson.binary import OLD_UUID_SUBTYPE import os import sys import json import glob import logging import inspect import argparse import traceback #import importlib import unicodecsv import os.path import datetime as dt import pdb # code snippet, to be included in 'sitecustomize.py' from billy.scrape.validator import DatetimeValidator #from pymongo.errors import OperationFailure import pymongo
params['literal.state'] = doc.metadata['bill']['state'] params['literal.chamber'] = doc.metadata['bill']['chamber'] params['literal.bill_title'] = ( doc.metadata['bill']['title'].encode('ascii', 'replace')) params['literal.document_name'] = doc.metadata['name'] params['literal.url'] = doc.metadata['url'] params['literal.id'] = version['document_id'] params['commit'] = 'false' url = "%supdate/extract?%s" % (solr_url, urllib.urlencode(params)) req = urllib2.Request(url, {'file': doc}) urllib2.urlopen(req) if __name__ == '__main__': parser = argparse.ArgumentParser( parents=[base_arg_parser], description="Download and store copies of bill versions.") parser.add_argument('-u', '--url', type=str, dest='url', default='http://localhost:8983/solr/', help='the solr instance URL') args = parser.parse_args() configure_logging(args.verbose, args.state) import_versions(args.state, args.url)
params = {} params['literal.bill_id'] = doc.metadata['bill']['bill_id'] params['literal.state'] = doc.metadata['bill']['state'] params['literal.chamber'] = doc.metadata['bill']['chamber'] params['literal.bill_title'] = ( doc.metadata['bill']['title'].encode('ascii', 'replace')) params['literal.document_name'] = doc.metadata['name'] params['literal.url'] = doc.metadata['url'] params['literal.id'] = version['document_id'] params['commit'] = 'false' url = "%supdate/extract?%s" % (solr_url, urllib.urlencode(params)) req = urllib2.Request(url, {'file': doc}) urllib2.urlopen(req) if __name__ == '__main__': parser = argparse.ArgumentParser( parents=[base_arg_parser], description="Download and store copies of bill versions.") parser.add_argument('-u', '--url', type=str, dest='url', default='http://localhost:8983/solr/', help='the solr instance URL') args = parser.parse_args() configure_logging(args.verbose, args.state) import_versions(args.state, args.url)
k.key = s3_path logging.info('beginning upload to %s' % s3_url) k.set_contents_from_filename(filename) k.set_acl('public-read') meta['latest_dump_url'] = s3_url meta['latest_dump_date'] = datetime.datetime.utcnow() db.metadata.save(meta, safe=True) logging.info('upload complete') if __name__ == '__main__': import argparse configure_logging(1) parser = argparse.ArgumentParser( description=('Dump API information to a zipped directory of JSON files' ', optionally uploading to S3 when done.'), parents=[base_arg_parser], ) parser.add_argument('abbrs', metavar='ABBR', type=str, nargs='+', help=('the two-letter abbreviation for the data to export')) parser.add_argument('--file', '-f', help='filename to output to (defaults to <abbr>.zip)') parser.add_argument('--schema_dir', help='directory to use for API schemas (optional)', default=None) parser.add_argument('--nodump', action='store_true', default=False, help="don't run the dump, only upload")
import argparse import logging from billy.conf import settings, base_arg_parser from billy.utils import configure_logging from billy.commands import BaseCommand logger = logging.getLogger('billy') configure_logging() COMMAND_MODULES = ( # lots of these commands can go away as billy matures 'billy.commands.serve', # useful for development 'billy.commands.textextract', # useful for development 'billy.commands.load_legislators', # allow editing legislators in admin 'billy.commands.download_photos', 'billy.commands.dump', 'billy.commands.update_external_ids', 'billy.commands.update_leg_ids', 'billy.commands.validate_api', ) if settings.ENABLE_OYSTER: COMMAND_MODULES += ('billy.commands.oysterize',) def import_command_module(mod): try: __import__(mod) except ImportError, e: logger.warning(
import argparse import logging from billy.conf import settings, base_arg_parser from billy.utils import configure_logging from billy.commands import BaseCommand logger = logging.getLogger('billy') configure_logging() COMMAND_MODULES = ( # lots of these commands can go away as billy matures 'billy.commands.serve', # useful for development 'billy.commands.textextract', # useful for development 'billy.commands.load_legislators', # allow editing legislators in admin 'billy.commands.download_photos', 'billy.commands.dump', 'billy.commands.update_external_ids', 'billy.commands.update_leg_ids', 'billy.commands.validate_api', ) if settings.ENABLE_OYSTER: COMMAND_MODULES += ('billy.commands.oysterize', ) def import_command_module(mod): try: __import__(mod) except ImportError, e: logger.warning('error "{0}" prevented loading of {1} module'.format(