def __init__(self, metadata, no_cache=False, output_dir=None,
             strict_validation=None, **kwargs):
    """
    Create a new Scraper instance.

    :param metadata: metadata for this scraper
    :param no_cache: if True, will ignore any cached downloads
    :param output_dir: the data directory to use
    :param strict_validation: exit immediately if validation fails
    """
    # configure underlying scrapelib object: an explicit no_cache request
    # always wins, otherwise only fill in the billy default when the
    # caller didn't pass their own cache_dir
    if no_cache:
        kwargs['cache_dir'] = None
    elif 'cache_dir' not in kwargs:
        kwargs['cache_dir'] = settings.BILLY_CACHE_DIR

    # remaining scrapelib options: caller-supplied values take precedence
    # over the billy settings defaults
    scrapelib_defaults = (
        ('error_dir', settings.BILLY_ERROR_DIR),
        ('timeout', settings.SCRAPELIB_TIMEOUT),
        ('requests_per_minute', None),
        ('retry_attempts', settings.SCRAPELIB_RETRY_ATTEMPTS),
        ('retry_wait_seconds', settings.SCRAPELIB_RETRY_WAIT_SECONDS),
    )
    for option, default in scrapelib_defaults:
        kwargs.setdefault(option, default)

    super(Scraper, self).__init__(**kwargs)

    # every scraper must define the attributes its level requires
    for required in settings.BILLY_LEVEL_FIELDS[self.level]:
        if not hasattr(self, required):
            raise Exception('%s scrapers must have a %s attribute' %
                            (self.level, required))

    self.metadata = metadata
    self.output_dir = output_dir

    # validation
    self.strict_validation = strict_validation
    self.validator = DatetimeValidator()

    self.follow_robots = False

    # logging convenience methods
    self.logger = logging.getLogger("billy")
    self.log = self.logger.info
    self.debug = self.logger.debug
    self.warning = self.logger.warning
def __init__(self, metadata, output_dir=None, strict_validation=None,
             fastmode=False):
    """
    Create a new Scraper instance.

    :param metadata: metadata for this scraper
    :param output_dir: the data directory to use
    :param strict_validation: exit immediately if validation fails
    :param fastmode: if True, disable rate limiting and allow reads
        from the cache without re-writing it
    """
    super(Scraper, self).__init__()

    # scrapelib overrides
    self.timeout = settings.SCRAPELIB_TIMEOUT
    self.cache_storage = scrapelib.FileCache(settings.BILLY_CACHE_DIR)
    self.requests_per_minute = settings.SCRAPELIB_RPM
    self.retry_attempts = settings.SCRAPELIB_RETRY_ATTEMPTS
    self.retry_wait_seconds = settings.SCRAPELIB_RETRY_WAIT_SECONDS

    if fastmode:
        self.requests_per_minute = 0
        self.cache_write_only = False

    # if scraper uses dryscrape, set up session
    if settings.USES_DRYSCRAPE:
        dryscrape.start_xvfb()
        self.session = dryscrape.Session()

    self.metadata = metadata
    self.output_dir = output_dir
    self.output_names = set()

    # make output_dir
    # BUG FIX: makedirs lives in os, not os.path -- the original
    # `os.path.isdir(...) or os.path.makedirs(...)` raised
    # AttributeError whenever the directory did not already exist
    if not os.path.isdir(self.output_dir):
        os.makedirs(self.output_dir)

    # validation
    self.strict_validation = strict_validation
    self.validator = DatetimeValidator()
    self._schema = {}
    self._load_schemas()

    # logging convenience methods
    self.logger = logging.getLogger("billy")
    self.log = self.logger.info
    self.info = self.logger.info
    self.debug = self.logger.debug
    self.warning = self.logger.warning
    self.error = self.logger.error
    self.critical = self.logger.critical
def __init__(self, metadata, output_dir=None, strict_validation=None,
             fastmode=False, **kwargs):
    """
    Create a new Scraper instance.

    :param metadata: metadata for this scraper
    :param output_dir: the data directory to use
    :param strict_validation: exit immediately if validation fails
    :param fastmode: if True, disable rate limiting and allow reads
        from the cache without re-writing it
    """
    # configure underlying scrapelib object
    kwargs['cache_obj'] = scrapelib.FileCache(settings.BILLY_CACHE_DIR)
    kwargs['requests_per_minute'] = settings.SCRAPELIB_RPM
    kwargs['timeout'] = settings.SCRAPELIB_TIMEOUT
    kwargs['retry_attempts'] = settings.SCRAPELIB_RETRY_ATTEMPTS
    kwargs['retry_wait_seconds'] = settings.SCRAPELIB_RETRY_WAIT_SECONDS

    if fastmode:
        kwargs['requests_per_minute'] = 0
        kwargs['cache_write_only'] = False

    super(Scraper, self).__init__(**kwargs)

    self.metadata = metadata
    self.output_dir = output_dir
    self.output_names = set()

    # make output_dir
    # BUG FIX: makedirs lives in os, not os.path -- the original
    # `os.path.isdir(...) or os.path.makedirs(...)` raised
    # AttributeError whenever the directory did not already exist
    if not os.path.isdir(self.output_dir):
        os.makedirs(self.output_dir)

    # validation
    self.strict_validation = strict_validation
    self.validator = DatetimeValidator()
    self._schema = {}
    self._load_schemas()

    self.follow_robots = False

    # logging convenience methods
    self.logger = logging.getLogger("billy")
    self.log = self.logger.info
    self.info = self.logger.info
    self.debug = self.logger.debug
    self.warning = self.logger.warning
    self.error = self.logger.error
    self.critical = self.logger.critical
def main():
    """Command-line entry point: scrape requested data types for one
    state module and save the results to disk."""
    parser = argparse.ArgumentParser(
        description='Scrape data for state, saving data to disk.',
        parents=[base_arg_parser],
    )
    parser.add_argument('state', type=str,
                        help='state scraper module (eg. nc)')
    # BUG FIX: give append-actions an empty-list default; without it
    # argparse leaves the attribute as None and the
    # `args.sessions.extend(...)` below raises AttributeError whenever
    # --term is passed without --session
    parser.add_argument('-s', '--session', action='append',
                        dest='sessions', default=[],
                        help='session(s) to scrape')
    parser.add_argument('-t', '--term', action='append', dest='terms',
                        default=[], help='term(s) to scrape')
    parser.add_argument('--upper', action='store_true', dest='upper',
                        default=False, help='scrape upper chamber')
    parser.add_argument('--lower', action='store_true', dest='lower',
                        default=False, help='scrape lower chamber')
    parser.add_argument('--bills', action='store_true', dest='bills',
                        default=False, help="scrape bill data")
    parser.add_argument('--legislators', action='store_true',
                        dest='legislators', default=False,
                        help="scrape legislator data")
    parser.add_argument('--committees', action='store_true',
                        dest='committees', default=False,
                        help="scrape committee data")
    parser.add_argument('--votes', action='store_true', dest='votes',
                        default=False, help="scrape vote data")
    parser.add_argument('--events', action='store_true', dest='events',
                        default=False, help='scrape event data')
    parser.add_argument('--alldata', action='store_true', dest='alldata',
                        default=False,
                        help="scrape all available types of data")
    # BUG FIX: the two help fragments were concatenated without a space
    # ("whenencountering")
    parser.add_argument('--strict', action='store_true', dest='strict',
                        default=False, help="fail immediately when "
                        "encountering validation warning")
    parser.add_argument('-n', '--no_cache', action='store_true',
                        dest='no_cache', help="don't use web page cache")
    parser.add_argument('--fastmode', help="scrape in fast mode",
                        action="store_true", default=False)
    # (removed a stray trailing comma here that wrapped the parser call
    # in a throwaway tuple)
    parser.add_argument('-r', '--rpm', action='store', type=int,
                        dest='rpm', default=60)
    parser.add_argument('--timeout', action='store', type=int,
                        dest='timeout', default=10)

    args = parser.parse_args()

    settings.update(args)

    # set up search path
    sys.path.insert(0, os.path.join(os.path.dirname(__file__),
                                    '../../openstates'))

    # get metadata
    metadata = __import__(args.state, fromlist=['metadata']).metadata
    state = metadata['abbreviation']

    configure_logging(args.verbose, args.state)

    # make output dir
    args.output_dir = os.path.join(settings.BILLY_DATA_DIR, args.state)
    try:
        os.makedirs(args.output_dir)
    except OSError as e:
        if e.errno != 17:  # 17 == EEXIST: an existing dir is fine
            raise

    # write metadata
    try:
        schema_path = os.path.join(os.path.split(__file__)[0],
                                   '../schemas/metadata.json')
        schema = json.load(open(schema_path))

        validator = DatetimeValidator()
        validator.validate(metadata, schema)
    except ValueError as e:
        logging.getLogger('billy').warning('metadata validation error: ' +
                                           str(e))

    with open(os.path.join(args.output_dir, 'state_metadata.json'),
              'w') as f:
        json.dump(metadata, f, cls=JSONDateEncoder)

    # determine time period to run for
    if args.terms:
        for term in metadata['terms']:
            # BUG FIX: terms in metadata are dicts, while args.terms holds
            # term *names*; the original `if term in args.terms` could
            # never match, so --term silently scraped nothing
            if term['name'] in args.terms:
                args.sessions.extend(term['sessions'])
    args.sessions = set(args.sessions or [])

    # determine chambers
    args.chambers = []
    if args.upper:
        args.chambers.append('upper')
    if args.lower:
        args.chambers.append('lower')
    if not args.chambers:
        args.chambers = ['upper', 'lower']

    if not (args.bills or args.legislators or args.votes or
            args.committees or args.events or args.alldata):
        raise ScrapeError("Must specify at least one of --bills, "
                          "--legislators, --committees, --votes, --events, "
                          "--alldata")

    if args.alldata:
        args.bills = True
        args.legislators = True
        args.votes = True
        args.committees = True

    if args.bills:
        _run_scraper(args.state, state, 'bills', args, metadata)
    if args.legislators:
        _run_scraper(args.state, state, 'legislators', args, metadata)
    if args.committees:
        _run_scraper(args.state, state, 'committees', args, metadata)
    if args.votes:
        _run_scraper(args.state, state, 'votes', args, metadata)
    if args.events:
        _run_scraper(args.state, state, 'events', args, metadata)
def main():
    # Command-line entry point for billy-update: parse arguments, then
    # run any combination of the scrape / import / report / session-list
    # steps for a single scraper module.
    # NOTE(review): this function uses Python-2-only constructs
    # (`raise lex, None, exc_traceback` and `ex.message` below) -- it will
    # not run under Python 3 as written.
    try:
        parser = argparse.ArgumentParser(
            description='update billy data',
            parents=[base_arg_parser],
        )

        # argument groups purely for --help organization
        what = parser.add_argument_group(
            'what to scrape', 'flags that help select what data to scrape')
        scrape = parser.add_argument_group('scraper config',
                                           'settings for the scraper')

        parser.add_argument('module', type=str,
                            help='scraper module (eg. nc)')
        parser.add_argument('--pdb', action='store_true', default=False,
                            help='invoke PDB when exception is raised')
        parser.add_argument('--ipdb', action='store_true', default=False,
                            help='invoke PDB when exception is raised')
        parser.add_argument('--pudb', action='store_true', default=False,
                            help='invoke PUDB when exception is raised')
        what.add_argument('-s', '--session', action='append',
                          dest='sessions', default=[],
                          help='session(s) to scrape')
        what.add_argument('-t', '--term', action='append', dest='terms',
                          help='term(s) to scrape', default=[])
        # chamber / data-type selectors accumulate into list attributes
        for arg in ('upper', 'lower'):
            what.add_argument('--' + arg, action='append_const',
                              dest='chambers', const=arg)
        for arg in ('bills', 'legislators', 'committees', 'votes',
                    'events', 'speeches'):
            what.add_argument('--' + arg, action='append_const',
                              dest='types', const=arg)
        for arg in ('scrape', 'import', 'report', 'session-list'):
            parser.add_argument('--' + arg, dest='actions',
                                action="append_const", const=arg,
                                help='only run %s step' % arg)

        # special modes for debugging
        scrape.add_argument('--nonstrict', action='store_false',
                            dest='strict', default=True,
                            help="don't fail immediately when"
                            " encountering validation warning")
        scrape.add_argument('--fastmode', help="scrape in fast mode",
                            action="store_true", default=False)

        # scrapelib overrides (dest names match settings keys so
        # settings.update(args) below picks them up)
        scrape.add_argument('-r', '--rpm', action='store', type=int,
                            dest='SCRAPELIB_RPM')
        scrape.add_argument('--timeout', action='store', type=int,
                            dest='SCRAPELIB_TIMEOUT')
        scrape.add_argument('--retries', type=int,
                            dest='SCRAPELIB_RETRY_ATTEMPTS')
        scrape.add_argument('--retry_wait', type=int,
                            dest='SCRAPELIB_RETRY_WAIT_SECONDS')

        args = parser.parse_args()

        # optionally install a post-mortem debugger as the excepthook
        if args.pdb or args.pudb or args.ipdb:
            _debugger = pdb
            if args.pudb:
                try:
                    import pudb
                    _debugger = pudb
                except ImportError:
                    pass
            if args.ipdb:
                try:
                    import ipdb
                    _debugger = ipdb
                except ImportError:
                    pass

            # turn on PDB-on-error mode
            # stolen from http://stackoverflow.com/questions/1237379/
            # if this causes problems in interactive mode check that page
            def _tb_info(type, value, tb):
                traceback.print_exception(type, value, tb)
                _debugger.pm()
            sys.excepthook = _tb_info

        # inject scraper paths so scraper module can be found
        for newpath in settings.SCRAPER_PATHS:
            sys.path.insert(0, newpath)

        # get metadata
        module = importlib.import_module(args.module)
        metadata = module.metadata
        module_settings = getattr(module, 'settings', {})
        abbrev = metadata['abbreviation']

        # load module settings, then command line settings
        settings.update(module_settings)
        settings.update(args)

        # make output dir
        args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbrev)

        # if terms aren't set, use latest
        if not args.terms:
            if args.sessions:
                # derive the term for each explicitly requested session
                for session in args.sessions:
                    args.terms.append(
                        term_for_session(metadata['abbreviation'], session,
                                         metadata))
                args.terms = list(set(args.terms or []))
            else:
                latest_term = metadata['terms'][-1]['name']
                args.terms = [latest_term]
        # only set sessions from terms if sessions weren't set
        elif not args.sessions:
            for term in metadata['terms']:
                if term['name'] in args.terms:
                    args.sessions.extend(term['sessions'])
            # dedup sessions
            args.sessions = list(set(args.sessions or []))

        if not args.sessions:
            args.sessions = [metadata['terms'][-1]['sessions'][-1]]

        # determine chambers
        if not args.chambers:
            args.chambers = ['upper', 'lower']

        if not args.actions:
            args.actions = ['scrape', 'import', 'report']

        if not args.types:
            args.types = ['bills', 'legislators', 'votes', 'committees',
                          'alldata']

            # optional data types gated on per-state feature flags
            if 'events' in metadata['feature_flags']:
                args.types.append('events')

            if 'speeches' in metadata['feature_flags']:
                args.types.append('speeches')

        plan = """billy-update abbr=%s
        actions=%s types=%s sessions=%s terms=%s""" % (
            args.module, ','.join(args.actions), ','.join(args.types),
            ','.join(args.sessions), ','.join(args.terms))
        logging.getLogger('billy').info(plan)

        scrape_data = {}

        if 'scrape' in args.actions:
            _clear_scraped_data(args.output_dir)

            # validate then write metadata
            if hasattr(module, 'session_list'):
                session_list = module.session_list()
            else:
                session_list = []
            check_sessions(metadata, session_list)

            try:
                schema_path = os.path.join(os.path.split(__file__)[0],
                                           '../schemas/metadata.json')
                schema = json.load(open(schema_path))

                validator = DatetimeValidator()
                validator.validate(metadata, schema)
            except ValueError as e:
                logging.getLogger('billy').warning(
                    'metadata validation error: ' + str(e))

            # each scraper run appends a record here; exec_record keeps a
            # reference to the same list plus timing/argv bookkeeping
            run_record = []
            exec_record = {
                "run_record": run_record,
                "args": sys.argv,
            }

            lex = None
            exc_traceback = None

            # start to run scrapers
            exec_start = dt.datetime.utcnow()

            # scraper order matters
            order = ('legislators', 'committees', 'votes', 'bills',
                     'events', 'speeches')
            _traceback = None
            try:
                for stype in order:
                    if stype in args.types:
                        run_record += _run_scraper(stype, args, metadata)
            except Exception as e:
                # remember the failing scraper type and traceback so the
                # failure can be recorded and re-raised after bookkeeping
                _traceback = _, _, exc_traceback = sys.exc_info()
                run_record += [{"exception": e, "type": stype}]
                lex = e

            exec_end = dt.datetime.utcnow()
            exec_record['started'] = exec_start
            exec_record['ended'] = exec_end
            scrape_data['scraped'] = exec_record
            scrape_data['abbr'] = abbrev

            # replace raw exception objects with a JSON-serializable dict
            for record in run_record:
                if "exception" in record:
                    ex = record['exception']
                    fb = traceback.format_exception(*_traceback)
                    trace = ""
                    for t in fb:
                        trace += t
                    # NOTE(review): ex.message is Python-2-only
                    record['exception'] = {
                        "type": ex.__class__.__name__,
                        "message": ex.message,
                        'traceback': trace
                    }
                    scrape_data['failure'] = True

            if lex:
                if 'import' in args.actions:
                    try:
                        db.billy_runs.save(scrape_data, safe=True)
                    except Exception:
                        # py2-only three-expression raise: re-raise the
                        # original scraper exception with its traceback
                        raise lex, None, exc_traceback
                        # XXX: This should *NEVER* happen, but it has
                        # in the past, so we're going to catch any errors
                        # writing to pymongo, and raise the original
                        # exception rather than let it look like Mongo's
                        # fault. Thanks for catching this, Thom.
                        #
                        # We lose the stack trace, but the Exception is the
                        # same in every other way.
                        #     -- paultag
                raise

        # imports
        if 'import' in args.actions:
            import_report = _do_imports(abbrev, args)
            scrape_data['imported'] = import_report
            # We're tying the run-logging into the import stage - since
            # import already writes to the DB, we might as well throw
            # this in too.
            db.billy_runs.save(scrape_data, safe=True)

        # reports
        if 'report' in args.actions:
            _do_reports(abbrev, args)

        if 'session-list' in args.actions:
            if hasattr(module, 'session_list'):
                print("\n".join(module.session_list()))
            else:
                raise ScrapeError('session_list() is not defined')

    except ScrapeError as e:
        logging.getLogger('billy').critical('Error: %s', e)
        sys.exit(1)
# make output dir args.output_dir = os.path.join(settings.BILLY_DATA_DIR, state) try: os.makedirs(args.output_dir) except OSError, e: if e.errno != 17: raise e # write metadata try: schema_path = os.path.join( os.path.split(__file__)[0], '../schemas/metadata.json') schema = json.load(open(schema_path)) validator = DatetimeValidator() validator.validate(metadata, schema) except ValueError, e: logging.getLogger('billy').warning('metadata validation error: ' + str(e)) with open(os.path.join(args.output_dir, 'state_metadata.json'), 'w') as f: json.dump(metadata, f, cls=JSONDateEncoder) # determine time period to run for if args.terms: for term in metadata['terms']: if term in args.terms: args.sessions.extend(term['sessions']) args.sessions = set(args.sessions or [])