def __init__(self, metadata, no_cache=False, output_dir=None,
             strict_validation=None, **kwargs):
    """
    Create a new Scraper instance.

    :param metadata: metadata for this scraper
    :param no_cache: if True, will ignore any cached downloads
    :param output_dir: the data directory to use
    :param strict_validation: exit immediately if validation fails
    """

    # configure underlying scrapelib object
    if no_cache:
        kwargs['cache_dir'] = None
    elif 'cache_dir' not in kwargs:
        kwargs['cache_dir'] = settings.BILLY_CACHE_DIR

    if 'error_dir' not in kwargs:
        kwargs['error_dir'] = settings.BILLY_ERROR_DIR

    if 'timeout' not in kwargs:
        kwargs['timeout'] = settings.SCRAPELIB_TIMEOUT

    if 'requests_per_minute' not in kwargs:
        kwargs['requests_per_minute'] = None

    if 'retry_attempts' not in kwargs:
        kwargs['retry_attempts'] = settings.SCRAPELIB_RETRY_ATTEMPTS

    if 'retry_wait_seconds' not in kwargs:
        kwargs['retry_wait_seconds'] = \
            settings.SCRAPELIB_RETRY_WAIT_SECONDS

    super(Scraper, self).__init__(**kwargs)

    for f in settings.BILLY_LEVEL_FIELDS[self.level]:
        if not hasattr(self, f):
            raise Exception('%s scrapers must have a %s attribute' %
                            (self.level, f))

    self.metadata = metadata
    self.output_dir = output_dir

    # validation
    self.strict_validation = strict_validation
    self.validator = DatetimeValidator()

    self.follow_robots = False

    # logging convenience methods
    self.logger = logging.getLogger("billy")
    self.log = self.logger.info
    self.debug = self.logger.debug
    self.warning = self.logger.warning
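A minimal sketch of how this constructor is meant to be used. The subclass name, the `level` value, the metadata dict, and the output path are invented for illustration, and the sketch assumes `settings.BILLY_LEVEL_FIELDS` maps `'state'` to a tuple containing `'state'`.

```python
# Hypothetical subclass for illustration only -- a concrete scraper must
# already carry the attributes listed in settings.BILLY_LEVEL_FIELDS[level]
# when __init__ runs, or the hasattr() check above raises.
class ExampleLegislatorScraper(Scraper):
    level = 'state'   # assumed level key in BILLY_LEVEL_FIELDS
    state = 'nc'      # assumed field required for that level

scraper = ExampleLegislatorScraper({'abbreviation': 'nc', 'terms': []},
                                   no_cache=True,           # skip cached pages
                                   output_dir='/tmp/billy-data',
                                   strict_validation=True)
```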
def __init__(self, metadata, output_dir=None, strict_validation=None,
             fastmode=False):
    """
    Create a new Scraper instance.

    :param metadata: metadata for this scraper
    :param output_dir: the data directory to use
    :param strict_validation: exit immediately if validation fails
    """
    super(Scraper, self).__init__()

    # scrapelib overrides
    self.timeout = settings.SCRAPELIB_TIMEOUT
    self.cache_storage = scrapelib.FileCache(settings.BILLY_CACHE_DIR)
    self.requests_per_minute = settings.SCRAPELIB_RPM
    self.retry_attempts = settings.SCRAPELIB_RETRY_ATTEMPTS
    self.retry_wait_seconds = settings.SCRAPELIB_RETRY_WAIT_SECONDS

    if fastmode:
        self.requests_per_minute = 0
        self.cache_write_only = False

    # if scraper uses dryscrape, set up session
    if settings.USES_DRYSCRAPE:
        dryscrape.start_xvfb()
        self.session = dryscrape.Session()

    self.metadata = metadata
    self.output_dir = output_dir
    self.output_names = set()

    # make output_dir if it doesn't already exist
    os.path.isdir(self.output_dir) or os.makedirs(self.output_dir)

    # validation
    self.strict_validation = strict_validation
    self.validator = DatetimeValidator()
    self._schema = {}
    self._load_schemas()

    # logging convenience methods
    self.logger = logging.getLogger("billy")
    self.log = self.logger.info
    self.info = self.logger.info
    self.debug = self.logger.debug
    self.warning = self.logger.warning
    self.error = self.logger.error
    self.critical = self.logger.critical
def __init__(self, metadata, output_dir=None, strict_validation=None,
             fastmode=False):
    """
    Create a new Scraper instance.

    :param metadata: metadata for this scraper
    :param output_dir: the data directory to use
    :param strict_validation: exit immediately if validation fails
    """
    super(Scraper, self).__init__()

    # scrapelib overrides
    self.timeout = settings.SCRAPELIB_TIMEOUT
    self.cache_storage = scrapelib.FileCache(settings.BILLY_CACHE_DIR)
    self.requests_per_minute = settings.SCRAPELIB_RPM
    self.retry_attempts = settings.SCRAPELIB_RETRY_ATTEMPTS
    self.retry_wait_seconds = settings.SCRAPELIB_RETRY_WAIT_SECONDS

    if fastmode:
        self.requests_per_minute = 0
        self.cache_write_only = False

    self.filter_bill_id = False

    self.metadata = metadata
    self.output_dir = output_dir
    self.output_names = set()

    # make output_dir, failing loudly if it was never provided
    if self.output_dir is None:
        _log.debug("output_dir is none")
        raise Exception("output_dir missing")
    _log.debug("output_dir %s" % self.output_dir)
    print("output_dir: %s" % self.output_dir)
    os.path.isdir(self.output_dir) or os.makedirs(self.output_dir)

    # validation
    self.strict_validation = strict_validation
    self.validator = DatetimeValidator()
    self._schema = {}
    self._load_schemas()

    self.follow_robots = False

    # logging convenience methods
    self.logger = logging.getLogger("billy")
    self.log = self.logger.info
    self.info = self.logger.info
    self.debug = self.logger.debug
    self.warning = self.logger.warning
    self.error = self.logger.error
    self.critical = self.logger.critical
class Scraper(scrapelib.Scraper):
    """ Base class for all Scrapers

    Provides several useful methods for retrieving URLs and checking
    arguments against metadata.
    """

    __metaclass__ = ScraperMeta

    latest_only = False

    def __init__(self, metadata, output_dir=None, strict_validation=None,
                 fastmode=False, **kwargs):
        """
        Create a new Scraper instance.

        :param metadata: metadata for this scraper
        :param output_dir: the data directory to use
        :param strict_validation: exit immediately if validation fails
        """

        # configure underlying scrapelib object
        kwargs["cache_obj"] = scrapelib.FileCache(settings.BILLY_CACHE_DIR)
        kwargs["requests_per_minute"] = settings.SCRAPELIB_RPM
        kwargs["timeout"] = settings.SCRAPELIB_TIMEOUT
        kwargs["retry_attempts"] = settings.SCRAPELIB_RETRY_ATTEMPTS
        kwargs["retry_wait_seconds"] = settings.SCRAPELIB_RETRY_WAIT_SECONDS

        if fastmode:
            kwargs["requests_per_minute"] = 0
            kwargs["cache_write_only"] = False

        super(Scraper, self).__init__(**kwargs)

        self.metadata = metadata
        self.output_dir = output_dir
        self.output_names = set()

        # make output_dir if it doesn't already exist
        os.path.isdir(self.output_dir) or os.makedirs(self.output_dir)

        # validation
        self.strict_validation = strict_validation
        self.validator = DatetimeValidator()
        self._schema = {}
        self._load_schemas()

        self.follow_robots = False

        # logging convenience methods
        self.logger = logging.getLogger("billy")
        self.log = self.logger.info
        self.info = self.logger.info
        self.debug = self.logger.debug
        self.warning = self.logger.warning
        self.error = self.logger.error
        self.critical = self.logger.critical

    def _load_schemas(self):
        """ load all schemas into schema dict """
        types = ("bill", "committee", "person", "vote", "event", "speech")

        for type in types:
            schema_path = os.path.join(os.path.split(__file__)[0],
                                       "../schemas/%s.json" % type)
            self._schema[type] = json.load(open(schema_path))
            self._schema[type]["properties"][settings.LEVEL_FIELD] = {
                "maxLength": 2, "minLength": 2, "type": "string"}

        # bills & votes
        self._schema["bill"]["properties"]["session"]["enum"] = \
            self.all_sessions()
        self._schema["vote"]["properties"]["session"]["enum"] = \
            self.all_sessions()

        # legislators
        terms = [t["name"] for t in self.metadata["terms"]]
        self._schema["person"]["properties"]["roles"]["items"][
            "properties"]["term"]["enum"] = terms

    @property
    def object_count(self):
        # number of distinct output filenames
        return len(self.output_names)

    def validate_json(self, obj):
        try:
            self.validator.validate(obj, self._schema[obj["_type"]])
        except ValueError as ve:
            self.warning(str(ve))
            if self.strict_validation:
                raise ve

    def all_sessions(self):
        sessions = []
        for t in self.metadata["terms"]:
            sessions.extend(t["sessions"])
        return sessions

    def validate_session(self, session, latest_only=False):
        """ Check that a session is present in the metadata dictionary.

        raises :exc:`~billy.scrape.NoDataForPeriod` if session is invalid

        :param session: string representing session to check
        """
        if latest_only:
            if session != self.metadata["terms"][-1]["sessions"][-1]:
                raise NoDataForPeriod(session)

        for t in self.metadata["terms"]:
            if session in t["sessions"]:
                return True
        raise NoDataForPeriod(session)

    def validate_term(self, term, latest_only=False):
        """ Check that a term is present in the metadata dictionary.

        raises :exc:`~billy.scrape.NoDataForPeriod` if term is invalid

        :param term: string representing term to check
        :param latest_only: if True, will raise exception if term is not
            the current term (default: False)
        """
        if latest_only:
            if term == self.metadata["terms"][-1]["name"]:
                return True
            else:
                raise NoDataForPeriod(term)

        for t in self.metadata["terms"]:
            if term == t["name"]:
                return True
        raise NoDataForPeriod(term)

    def save_object(self, obj):
        self.log("save %s %s", obj["_type"], unicode(obj))

        # copy over LEVEL_FIELD
        obj[settings.LEVEL_FIELD] = getattr(self, settings.LEVEL_FIELD)

        filename = obj.get_filename()
        self.output_names.add(filename)   # keep tally of all output names

        # pluralize type
        if obj["_type"] == "speech":
            data_dir = "speeches"
        elif obj["_type"] == "person":
            data_dir = "legislators"
        else:
            data_dir = obj["_type"] + "s"

        with open(os.path.join(self.output_dir, data_dir, filename),
                  "w") as f:
            json.dump(obj, f, cls=JSONEncoderPlus)

        # validate after writing, allows for inspection
        self.validate_json(obj)
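Since several methods above walk the same metadata structure, here is a small, self-contained illustration of the shape `all_sessions()` and `validate_session()` expect. The values are invented, not real billy metadata.

```python
# Invented metadata, shaped like the dicts the methods above iterate over:
# 'terms' is a list of dicts, each with a 'name' and a 'sessions' list.
example_metadata = {
    "abbreviation": "xy",
    "terms": [
        {"name": "2009-2010", "sessions": ["2009", "2010"]},
        {"name": "2011-2012", "sessions": ["2011", "2011 Special Session"]},
    ],
}

sessions = []
for t in example_metadata["terms"]:
    sessions.extend(t["sessions"])       # mirrors all_sessions()

assert "2011" in sessions                # validate_session("2011") would pass
assert "1999" not in sessions            # would raise NoDataForPeriod("1999")
```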
def main():
    try:
        parser = argparse.ArgumentParser(
            description='update billy data',
            parents=[base_arg_parser],
        )

        what = parser.add_argument_group(
            'what to scrape', 'flags that help select what data to scrape')
        scrape = parser.add_argument_group('scraper config',
                                           'settings for the scraper')

        parser.add_argument('module', type=str, help='scraper module (eg. nc)')
        parser.add_argument('--pdb', action='store_true', default=False,
                            help='invoke PDB when exception is raised')
        parser.add_argument('--ipdb', action='store_true', default=False,
                            help='invoke IPDB when exception is raised')
        parser.add_argument('--pudb', action='store_true', default=False,
                            help='invoke PUDB when exception is raised')
        what.add_argument('-s', '--session', action='append',
                          dest='sessions', default=[],
                          help='session(s) to scrape')
        what.add_argument('-t', '--term', action='append', dest='terms',
                          help='term(s) to scrape', default=[])

        for arg in ('upper', 'lower'):
            what.add_argument('--' + arg, action='append_const',
                              dest='chambers', const=arg)
        for arg in ('bills', 'legislators', 'committees',
                    'votes', 'events', 'speeches'):
            what.add_argument('--' + arg, action='append_const', dest='types',
                              const=arg)
        for arg in ('scrape', 'import', 'report', 'session-list'):
            parser.add_argument('--' + arg, dest='actions',
                                action="append_const", const=arg,
                                help='only run %s step' % arg)

        # special modes for debugging
        scrape.add_argument('--nonstrict', action='store_false', dest='strict',
                            default=True, help="don't fail immediately when"
                            " encountering validation warning")
        scrape.add_argument('--fastmode', help="scrape in fast mode",
                            action="store_true", default=False)

        # scrapelib overrides
        scrape.add_argument('-r', '--rpm', action='store', type=int,
                            dest='SCRAPELIB_RPM')
        scrape.add_argument('--timeout', action='store', type=int,
                            dest='SCRAPELIB_TIMEOUT')
        scrape.add_argument('--retries', type=int,
                            dest='SCRAPELIB_RETRY_ATTEMPTS')
        scrape.add_argument('--retry_wait', type=int,
                            dest='SCRAPELIB_RETRY_WAIT_SECONDS')

        args = parser.parse_args()

        if args.pdb or args.pudb or args.ipdb:
            _debugger = pdb
            if args.pudb:
                try:
                    import pudb
                    _debugger = pudb
                except ImportError:
                    pass
            if args.ipdb:
                try:
                    import ipdb
                    _debugger = ipdb
                except ImportError:
                    pass

            # turn on PDB-on-error mode
            # stolen from http://stackoverflow.com/questions/1237379/
            # if this causes problems in interactive mode check that page
            def _tb_info(type, value, tb):
                traceback.print_exception(type, value, tb)
                _debugger.pm()
            sys.excepthook = _tb_info

        # inject scraper paths so scraper module can be found
        for newpath in settings.SCRAPER_PATHS:
            sys.path.insert(0, newpath)

        # get metadata
        module = importlib.import_module(args.module)
        metadata = module.metadata
        module_settings = getattr(module, 'settings', {})
        abbrev = metadata['abbreviation']

        # load module settings, then command line settings
        settings.update(module_settings)
        settings.update(args)

        # make output dir
        args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbrev)

        # if terms aren't set, use latest
        if not args.terms:
            if args.sessions:
                for session in args.sessions:
                    args.terms.append(
                        term_for_session(metadata['abbreviation'], session,
                                         metadata))
                args.terms = list(set(args.terms or []))
            else:
                latest_term = metadata['terms'][-1]['name']
                args.terms = [latest_term]
        # only set sessions from terms if sessions weren't set
        elif not args.sessions:
            for term in metadata['terms']:
                if term['name'] in args.terms:
                    args.sessions.extend(term['sessions'])

            # dedup sessions
            args.sessions = list(set(args.sessions or []))

        if not args.sessions:
            args.sessions = [metadata['terms'][-1]['sessions'][-1]]

        # determine chambers
        if not args.chambers:
            args.chambers = ['upper', 'lower']

        if not args.actions:
            args.actions = ['scrape', 'import', 'report']

        if not args.types:
            args.types = ['bills', 'legislators', 'votes', 'committees',
                          'alldata']

            if 'events' in metadata['feature_flags']:
                args.types.append('events')

            if 'speeches' in metadata['feature_flags']:
                args.types.append('speeches')

        plan = """billy-update abbr=%s
    actions=%s
    types=%s
    sessions=%s
    terms=%s""" % (args.module, ','.join(args.actions), ','.join(args.types),
                   ','.join(args.sessions), ','.join(args.terms))
        logging.getLogger('billy').info(plan)

        scrape_data = {}

        if 'scrape' in args.actions:
            _clear_scraped_data(args.output_dir)

            # validate then write metadata
            if hasattr(module, 'session_list'):
                session_list = module.session_list()
            else:
                session_list = []
            check_sessions(metadata, session_list)

            try:
                schema_path = os.path.join(os.path.split(__file__)[0],
                                           '../schemas/metadata.json')
                schema = json.load(open(schema_path))

                validator = DatetimeValidator()
                validator.validate(metadata, schema)
            except ValueError as e:
                logging.getLogger('billy').warning(
                    'metadata validation error: ' + str(e))

            run_record = []
            exec_record = {
                "run_record": run_record,
                "args": sys.argv,
            }

            lex = None
            exc_traceback = None

            # start to run scrapers
            exec_start = dt.datetime.utcnow()

            # scraper order matters
            order = ('legislators', 'committees', 'votes', 'bills',
                     'events', 'speeches')
            _traceback = None
            try:
                for stype in order:
                    if stype in args.types:
                        run_record += _run_scraper(stype, args, metadata)
            except Exception as e:
                _traceback = _, _, exc_traceback = sys.exc_info()
                run_record += [{"exception": e, "type": stype}]
                lex = e

            exec_end = dt.datetime.utcnow()
            exec_record['started'] = exec_start
            exec_record['ended'] = exec_end
            scrape_data['scraped'] = exec_record
            scrape_data['abbr'] = abbrev

            for record in run_record:
                if "exception" in record:
                    ex = record['exception']
                    fb = traceback.format_exception(*_traceback)
                    trace = ""
                    for t in fb:
                        trace += t
                    record['exception'] = {
                        "type": ex.__class__.__name__,
                        "message": ex.message,
                        'traceback': trace
                    }
                    scrape_data['failure'] = True

            if lex:
                if 'import' in args.actions:
                    try:
                        db.billy_runs.save(scrape_data, safe=True)
                    except Exception:
                        raise lex, None, exc_traceback
                        # XXX: This should *NEVER* happen, but it has
                        # in the past, so we're going to catch any errors
                        # writing to pymongo, and raise the original
                        # exception rather then let it look like Mongo's
                        # fault. Thanks for catching this, Thom.
                        #
                        # We lose the stack trace, but the Exception is the
                        # same in every other way.
                        #  -- paultag
                raise

        # imports
        if 'import' in args.actions:
            import_report = _do_imports(abbrev, args)
            scrape_data['imported'] = import_report
            # We're tying the run-logging into the import stage - since import
            # already writes to the DB, we might as well throw this in too.
            db.billy_runs.save(scrape_data, safe=True)

        # reports
        if 'report' in args.actions:
            _do_reports(abbrev, args)

        if 'session-list' in args.actions:
            if hasattr(module, 'session_list'):
                print("\n".join(module.session_list()))
            else:
                raise ScrapeError('session_list() is not defined')

    except ScrapeError as e:
        logging.getLogger('billy').critical('Error: %s', e)
        sys.exit(1)
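For reference, a few example invocations of the entry point built on this parser. The module name `nc` and session `2011` are placeholders.

```python
# Illustrative command lines only:
#
#   billy-update nc                        # scrape, import, and report the latest term
#   billy-update nc --bills -s 2011        # scrape only bills for one session
#   billy-update nc --scrape --fastmode    # scrape step only, no rate limiting
#   billy-update nc --session-list         # print the module's session_list()
```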
class Scraper(scrapelib.Scraper):
    """ Base class for all Scrapers

    Provides several useful methods for retrieving URLs and checking
    arguments against metadata.
    """

    latest_only = False

    def __init__(self, metadata, output_dir=None, strict_validation=None,
                 fastmode=False):
        """
        Create a new Scraper instance.

        :param metadata: metadata for this scraper
        :param output_dir: the data directory to use
        :param strict_validation: exit immediately if validation fails
        """
        super(Scraper, self).__init__()

        # scrapelib overrides
        self.timeout = settings.SCRAPELIB_TIMEOUT
        self.cache_storage = scrapelib.FileCache(settings.BILLY_CACHE_DIR)
        self.requests_per_minute = settings.SCRAPELIB_RPM
        self.retry_attempts = settings.SCRAPELIB_RETRY_ATTEMPTS
        self.retry_wait_seconds = settings.SCRAPELIB_RETRY_WAIT_SECONDS

        if fastmode:
            self.requests_per_minute = 0
            self.cache_write_only = False

        self.metadata = metadata
        self.output_dir = output_dir
        self.output_names = set()

        # make output_dir if it doesn't already exist
        os.path.isdir(self.output_dir) or os.makedirs(self.output_dir)

        # validation
        self.strict_validation = strict_validation
        self.validator = DatetimeValidator()
        self._schema = {}
        self._load_schemas()

        # logging convenience methods
        self.logger = logging.getLogger("billy")
        self.log = self.logger.info
        self.info = self.logger.info
        self.debug = self.logger.debug
        self.warning = self.logger.warning
        self.error = self.logger.error
        self.critical = self.logger.critical

    def _load_schemas(self):
        """ load all schemas into schema dict """
        types = ('bill', 'committee', 'person', 'vote', 'event', 'speech')

        for type in types:
            schema_path = os.path.join(os.path.split(__file__)[0],
                                       '../schemas/%s.json' % type)
            self._schema[type] = json.load(open(schema_path))
            self._schema[type]['properties'][settings.LEVEL_FIELD] = {
                'minLength': 2, 'type': 'string'}

        # bills & votes
        self._schema['bill']['properties']['session']['enum'] = \
            self.all_sessions()
        self._schema['vote']['properties']['session']['enum'] = \
            self.all_sessions()

        # legislators
        terms = [t['name'] for t in self.metadata['terms']]
        # ugly break here b/c this line is nearly impossible to split
        self._schema['person']['properties']['roles'][
            'items']['properties']['term']['enum'] = terms

    @property
    def object_count(self):
        # number of distinct output filenames
        return len(self.output_names)

    def validate_json(self, obj):
        try:
            self.validator.validate(obj, self._schema[obj['_type']])
        except ValueError as ve:
            self.warning(str(ve))
            if self.strict_validation:
                raise ve

    def all_sessions(self):
        sessions = []
        for t in self.metadata['terms']:
            sessions.extend(t['sessions'])
        return sessions

    def validate_session(self, session, latest_only=False):
        """ Check that a session is present in the metadata dictionary.

        raises :exc:`~billy.scrape.NoDataForPeriod` if session is invalid

        :param session: string representing session to check
        """
        if latest_only:
            if session != self.metadata['terms'][-1]['sessions'][-1]:
                raise NoDataForPeriod(session)

        for t in self.metadata['terms']:
            if session in t['sessions']:
                return True
        raise NoDataForPeriod(session)

    def validate_term(self, term, latest_only=False):
        """ Check that a term is present in the metadata dictionary.

        raises :exc:`~billy.scrape.NoDataForPeriod` if term is invalid

        :param term: string representing term to check
        :param latest_only: if True, will raise exception if term is not
            the current term (default: False)
        """
        if latest_only:
            if term == self.metadata['terms'][-1]['name']:
                return True
            else:
                raise NoDataForPeriod(term)

        for t in self.metadata['terms']:
            if term == t['name']:
                return True
        raise NoDataForPeriod(term)

    def save_object(self, obj):
        self.log('save %s %s', obj['_type'], unicode(obj))

        # copy jurisdiction to LEVEL_FIELD
        obj[settings.LEVEL_FIELD] = getattr(self, 'jurisdiction')

        filename = obj.get_filename()
        self.output_names.add(filename)   # keep tally of all output names

        # pluralize type
        if obj['_type'] == 'speech':
            data_dir = 'speeches'
        elif obj['_type'] == 'person':
            data_dir = 'legislators'
        else:
            data_dir = obj['_type'] + 's'

        with open(os.path.join(self.output_dir, data_dir, filename),
                  'w') as f:
            json.dump(obj, f, cls=JSONEncoderPlus)

        # validate after writing, allows for inspection
        self.validate_json(obj)
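A short sketch of what the `fastmode` flag changes on an instance of this variant. The metadata dict and output path are placeholders, and running it assumes the packaged schema files are importable alongside this module; the assertions only restate the assignments made in `__init__` above.

```python
# Sketch: fastmode keeps the file cache but drops rate limiting and lets
# cached responses be read back instead of treated as write-only.
example_metadata = {'abbreviation': 'xy', 'terms': []}   # placeholder
s = Scraper(example_metadata, output_dir='/tmp/billy-data', fastmode=True)
assert s.requests_per_minute == 0
assert s.cache_write_only is False
```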
class Scraper(scrapelib.Scraper):
    """ Base class for all Scrapers

    Provides several useful methods for retrieving URLs and checking
    arguments against metadata.
    """

    __metaclass__ = ScraperMeta

    def __init__(self, metadata, no_cache=False, output_dir=None,
                 strict_validation=None, **kwargs):
        """
        Create a new Scraper instance.

        :param metadata: metadata for this state
        :param no_cache: if True, will ignore any cached downloads
        :param output_dir: the data directory to use
        :param strict_validation: exit immediately if validation fails
        """

        # configure underlying scrapelib object
        if no_cache:
            kwargs['cache_dir'] = None
        elif 'cache_dir' not in kwargs:
            kwargs['cache_dir'] = settings.BILLY_CACHE_DIR

        if 'error_dir' not in kwargs:
            kwargs['error_dir'] = settings.BILLY_ERROR_DIR

        if 'timeout' not in kwargs:
            kwargs['timeout'] = settings.SCRAPELIB_TIMEOUT

        if 'requests_per_minute' not in kwargs:
            kwargs['requests_per_minute'] = None

        if 'retry_attempts' not in kwargs:
            kwargs['retry_attempts'] = settings.SCRAPELIB_RETRY_ATTEMPTS

        if 'retry_wait_seconds' not in kwargs:
            kwargs['retry_wait_seconds'] = \
                settings.SCRAPELIB_RETRY_WAIT_SECONDS

        super(Scraper, self).__init__(**kwargs)

        if not hasattr(self, 'state'):
            raise Exception('Scrapers must have a state attribute')

        self.metadata = metadata
        self.output_dir = output_dir

        # validation
        self.strict_validation = strict_validation
        self.validator = DatetimeValidator()

        self.follow_robots = False

        # logging convenience methods
        self.logger = logging.getLogger("billy")
        self.log = self.logger.info
        self.debug = self.logger.debug
        self.warning = self.logger.warning

    def validate_json(self, obj):
        if not hasattr(self, '_schema'):
            self._schema = self._get_schema()
        try:
            self.validator.validate(obj, self._schema)
        except ValueError as ve:
            self.warning(str(ve))
            if self.strict_validation:
                raise ve

    def all_sessions(self):
        sessions = []
        for t in self.metadata['terms']:
            sessions.extend(t['sessions'])
        return sessions

    def validate_session(self, session):
        """ Check that a session is present in the metadata dictionary.

        raises :exc:`~billy.scrape.NoDataForPeriod` if session is invalid

        :param session: string representing session to check
        """
        for t in self.metadata['terms']:
            if session in t['sessions']:
                return True
        raise NoDataForPeriod(session)

    def validate_term(self, term, latest_only=False):
        """ Check that a term is present in the metadata dictionary.

        raises :exc:`~billy.scrape.NoDataForPeriod` if term is invalid

        :param term: string representing term to check
        :param latest_only: if True, will raise exception if term is not
            the current term (default: False)
        """
        if latest_only:
            if term == self.metadata['terms'][-1]['name']:
                return True
            else:
                raise NoDataForPeriod(term)

        for t in self.metadata['terms']:
            if term == t['name']:
                return True
        raise NoDataForPeriod(term)
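A sketch of the subclass contract this older variant enforces: a `state` attribute must exist before `__init__` runs. The class name and values below are invented, and actually constructing it assumes the older scrapelib keyword API (`cache_dir`, `error_dir`) used above.

```python
# Hypothetical subclass: defining 'state' at class level satisfies the
# hasattr(self, 'state') check performed in __init__ above.
class ExampleStateScraper(Scraper):
    state = 'nc'   # invented abbreviation

scraper = ExampleStateScraper(metadata={'terms': []},
                              output_dir='/tmp/billy-data')
```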
# make output dir
args.output_dir = os.path.join(settings.BILLY_DATA_DIR, state)
try:
    os.makedirs(args.output_dir)
except OSError, e:
    if e.errno != 17:
        raise e

# write metadata
try:
    schema_path = os.path.join(os.path.split(__file__)[0],
                               '../schemas/metadata.json')
    schema = json.load(open(schema_path))

    validator = DatetimeValidator()
    validator.validate(metadata, schema)
except ValueError, e:
    logging.getLogger('billy').warning('metadata validation error: ' +
                                       str(e))

with open(os.path.join(args.output_dir, 'state_metadata.json'), 'w') as f:
    json.dump(metadata, f, cls=JSONDateEncoder)

# determine time period to run for
if args.terms:
    for term in metadata['terms']:
        if term['name'] in args.terms:
            args.sessions.extend(term['sessions'])
args.sessions = set(args.sessions or [])
def main(old_scrape_compat=False):
    try:
        parser = argparse.ArgumentParser(
            description='update billy data',
            parents=[base_arg_parser],
        )

        what = parser.add_argument_group(
            'what to scrape', 'flags that help select what data to scrape')
        scrape = parser.add_argument_group(
            'scraper config', 'settings for the scraper')

        parser.add_argument('module', type=str, help='scraper module (eg. nc)')
        parser.add_argument('--pdb', action='store_true', default=False,
                            help='invoke PDB when exception is raised')
        parser.add_argument('--ipdb', action='store_true', default=False,
                            help='invoke IPDB when exception is raised')
        parser.add_argument('--pudb', action='store_true', default=False,
                            help='invoke PUDB when exception is raised')
        what.add_argument('-s', '--session', action='append',
                          dest='sessions', default=[],
                          help='session(s) to scrape')
        what.add_argument('-t', '--term', action='append', dest='terms',
                          help='term(s) to scrape', default=[])

        for arg in ('upper', 'lower'):
            what.add_argument('--' + arg, action='append_const',
                              dest='chambers', const=arg)
        for arg in ('bills', 'legislators', 'committees',
                    'votes', 'events', 'speeches'):
            what.add_argument('--' + arg, action='append_const', dest='types',
                              const=arg)
        for arg in ('scrape', 'import', 'report'):
            parser.add_argument('--' + arg, dest='actions',
                                action="append_const", const=arg,
                                help='only run %s step' % arg)

        # special modes for debugging
        scrape.add_argument('--nonstrict', action='store_false', dest='strict',
                            default=True, help="don't fail immediately when"
                            " encountering validation warning")
        scrape.add_argument('--fastmode', help="scrape in fast mode",
                            action="store_true", default=False)
        scrape.add_argument('--billid', help="scrape only a single bill",
                            action="store", default=False)

        # scrapelib overrides
        scrape.add_argument('-r', '--rpm', action='store', type=int,
                            dest='SCRAPELIB_RPM')
        scrape.add_argument('--timeout', action='store', type=int,
                            dest='SCRAPELIB_TIMEOUT')
        scrape.add_argument('--retries', type=int,
                            dest='SCRAPELIB_RETRY_ATTEMPTS')
        scrape.add_argument('--retry_wait', type=int,
                            dest='SCRAPELIB_RETRY_WAIT_SECONDS')

        args = parser.parse_args()

        if args.pdb or args.pudb or args.ipdb:
            _debugger = pdb
            if args.pudb:
                try:
                    import pudb
                    _debugger = pudb
                except ImportError:
                    pass
            if args.ipdb:
                try:
                    import ipdb
                    _debugger = ipdb
                except ImportError:
                    pass

            # turn on PDB-on-error mode
            # stolen from http://stackoverflow.com/questions/1237379/
            # if this causes problems in interactive mode check that page
            def _tb_info(_type, value, tb):
                traceback.print_exception(_type, value, tb)
                _debugger.pm()
            sys.excepthook = _tb_info

        # inject scraper paths so scraper module can be found
        for newpath in settings.SCRAPER_PATHS:
            sys.path.insert(0, newpath)

        # get metadata
        module = importlib.import_module(args.module)
        metadata = module.metadata
        module_settings = getattr(module, 'settings', {})
        abbrev = metadata['abbreviation']

        # load module settings, then command line settings
        settings.update(module_settings)
        settings.update(args)

        # make output dir
        args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbrev)

        # if terms aren't set, use latest
        if not args.terms:
            if args.sessions:
                for session in args.sessions:
                    args.terms.append(
                        term_for_session(metadata['abbreviation'], session,
                                         metadata))
                args.terms = list(set(args.terms or []))
            else:
                latest_term = metadata['terms'][-1]['name']
                args.terms = [latest_term]
        # only set sessions from terms if sessions weren't set
        elif not args.sessions:
            for term in metadata['terms']:
                if term['name'] in args.terms:
                    args.sessions.extend(term['sessions'])

            # dedup sessions
            args.sessions = list(set(args.sessions or []))

        if not args.sessions:
            args.sessions = [metadata['terms'][-1]['sessions'][-1]]

        # determine chambers
        if not args.chambers:
            args.chambers = ['upper', 'lower']

        if not args.actions:
            args.actions = ['scrape', 'import', 'report']

        if not args.types:
            args.types = ['bills', 'legislators', 'votes', 'committees',
                          'alldata']

            if 'events' in metadata['feature_flags']:
                args.types.append('events')

            if 'speeches' in metadata['feature_flags']:
                args.types.append('speeches')

        plan = """billy-update abbr=%s
    actions=%s
    types=%s
    sessions=%s
    terms=%s""" % (args.module, ','.join(args.actions), ','.join(args.types),
                   ','.join(args.sessions), ','.join(args.terms))
        _log.info(plan)
        scrape_data = {}

        if args.billid is False:
            _log.debug("No billid filter.")
        else:
            _log.debug("Search for billid: %s" % args.billid)

        if 'scrape' in args.actions:
            _clear_scraped_data(args.output_dir)

            # validate then write metadata
            if hasattr(module, 'session_list'):
                session_list = module.session_list()
            else:
                session_list = []
            check_sessions(metadata, session_list)

            _log.debug("Session List %s" % session_list)

            try:
                schema_path = os.path.join(
                    os.path.split(__file__)[0], '../schemas/metadata.json')
                schema = json.load(open(schema_path))

                validator = DatetimeValidator()
                validator.validate(metadata, schema)
            except ValueError as e:
                _log.warning('metadata validation error: ' + str(e))

            with open(os.path.join(args.output_dir, 'metadata.json'),
                      'w') as f:
                json.dump(metadata, f, cls=JSONDateEncoder)

            run_record = []
            exec_record = {
                "run_record": run_record,
                "args": sys.argv,
                "state": abbrev
            }

            lex = None
            exc_traceback = None

            # start to run scrapers
            exec_start = dt.datetime.utcnow()

            # scraper order matters
            if args.billid is False:
                order = ('legislators', 'committees', 'votes', 'bills',
                         'events', 'speeches')
            else:
                _log.debug("going to process bills")
                order = ('bills',)   # only process the bills

            _traceback = None
            try:
                for stype in order:
                    _log.debug("consider to process %s" % stype)
                    if stype in args.types:
                        _log.debug("going to process %s" % stype)
                        scraper_results = _run_scraper(stype, args, metadata)
                        run_record += scraper_results
                    else:
                        _log.debug("skipping %s" % stype)
            except Exception as e:
                _traceback = _, _, exc_traceback = sys.exc_info()
                run_record += [{"exception": e, "type": stype}]
                lex = e

            exec_end = dt.datetime.utcnow()
            exec_record['started'] = exec_start
            exec_record['ended'] = exec_end
            scrape_data['scraped'] = exec_record
            scrape_data['abbr'] = abbrev

            for record in run_record:
                if "exception" in record:
                    ex = record['exception']
                    fb = traceback.format_exception(*_traceback)
                    trace = ""
                    for t in fb:
                        trace += t
                    record['exception'] = {
                        "type": ex.__class__.__name__,
                        "message": ex,
                        'traceback': trace
                    }
                    scrape_data['failure'] = True

            if lex:
                if 'import' in args.actions:
                    try:
                        _log.debug("scrape_data:")
                        if scrape_data['failure']:
                            _log.debug("Failed")
                            _log.debug(scrape_data)
                        else:
                            _log.debug("OK")
                            _log.debug(scrape_data)
                        db.billy_runs.save(scrape_data, safe=True)
                    except KeyError as e:
                        _log.debug("Caught exception1 :")
                        _log.debug(e)
                        exit(123)
                    except pymongo.errors.OperationFailure as e:
                        _log.debug("Caught exception3 :")
                        _log.debug(e)
                        exit(123)
                    except Exception as e:
                        _log.debug("Caught exception :")
                        _log.debug(e)
                        exit(123)
                        raise lex, None, exc_traceback
                        # XXX: This should *NEVER* happen, but it has
                        # in the past, so we're going to catch any errors
                        # writing to pymongo, and raise the original
                        # exception rather then let it look like Mongo's
                        # fault. Thanks for catching this, Thom.
                        #
                        # We lose the stack trace, but the Exception is the
                        # same in every other way.
                        #  -- paultag
                raise

        # imports
        if 'import' in args.actions:
            import_report = _do_imports(abbrev, args)
            scrape_data['imported'] = import_report
            # We're tying the run-logging into the import stage - since import
            # already writes to the DB, we might as well throw this in too.
            _log.debug(scrape_data)
            db.billy_runs.save(scrape_data, safe=True)

        # reports
        if 'report' in args.actions:
            _do_reports(abbrev, args)

        if 'session-list' in args.actions:
            if hasattr(module, 'session_list'):
                print("\n".join(module.session_list()))
            else:
                raise ScrapeError('session_list() is not defined')

    except ScrapeError as e:
        _log.debug("in update.py Scrape error")
        _log.debug("Scrape error :%s" % e)
        _log.critical('Error: %s' % e)
        sys.exit(1)

    except TypeError as e:
        _log.debug("Type error")
        _log.critical('TypeError:', e)
        sys.exit(1)

    except NoData as e:
        _log.debug("No Data")
        _log.debug(e)
        _log.critical('No Data:')
        sys.exit(1)

    except NoDoc as e:
        _log.debug("No Doc")
        _log.critical('No Doc:', e)
        sys.exit(1)

    except NoXpath as e:
        _log.debug("No XPath")
        _log.critical('No XPath:', e)
        sys.exit(1)

    except Exception as e:
        _log.debug("Unknown error3")
        _log.debug(e)
        _log.critical('Unknown Error')
        sys.exit(1)
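An example of how the `--billid` filter narrows a run in this variant; the module, session, and bill id below are placeholders.

```python
# Illustrative invocation: with --billid set, the scraper order collapses to
# ('bills',) above, so only the bill scraper runs and other types are skipped.
#
#   billy-update nc --bills -s 2011 --billid "HB 1"
```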
class Scraper(scrapelib.Scraper):
    """ Base class for all Scrapers

    Provides several useful methods for retrieving URLs and checking
    arguments against metadata.
    """

    __metaclass__ = ScraperMeta

    latest_only = False

    def __init__(self, metadata, output_dir=None, strict_validation=None,
                 fastmode=False, **kwargs):
        """
        Create a new Scraper instance.

        :param metadata: metadata for this scraper
        :param output_dir: the data directory to use
        :param strict_validation: exit immediately if validation fails
        """

        # configure underlying scrapelib object
        kwargs['cache_obj'] = scrapelib.FileCache(settings.BILLY_CACHE_DIR)
        kwargs['requests_per_minute'] = settings.SCRAPELIB_RPM
        kwargs['timeout'] = settings.SCRAPELIB_TIMEOUT
        kwargs['retry_attempts'] = settings.SCRAPELIB_RETRY_ATTEMPTS
        kwargs['retry_wait_seconds'] = settings.SCRAPELIB_RETRY_WAIT_SECONDS

        if fastmode:
            kwargs['requests_per_minute'] = 0
            kwargs['cache_write_only'] = False

        super(Scraper, self).__init__(**kwargs)

        for f in settings.BILLY_LEVEL_FIELDS[self.level]:
            if not hasattr(self, f):
                raise Exception('%s scrapers must have a %s attribute' %
                                (self.level, f))

        self.metadata = metadata
        self.output_dir = output_dir

        # make output_dir if it doesn't already exist
        os.path.isdir(self.output_dir) or os.makedirs(self.output_dir)

        # validation
        self.strict_validation = strict_validation
        self.validator = DatetimeValidator()

        self.follow_robots = False

        # logging convenience methods
        self.logger = logging.getLogger("billy")
        self.log = self.logger.info
        self.debug = self.logger.debug
        self.warning = self.logger.warning

    def validate_json(self, obj):
        if not hasattr(self, '_schema'):
            self._schema = self._get_schema()
        try:
            self.validator.validate(obj, self._schema)
        except ValueError as ve:
            self.warning(str(ve))
            if self.strict_validation:
                raise ve

    def all_sessions(self):
        sessions = []
        for t in self.metadata['terms']:
            sessions.extend(t['sessions'])
        return sessions

    def validate_session(self, session):
        """ Check that a session is present in the metadata dictionary.

        raises :exc:`~billy.scrape.NoDataForPeriod` if session is invalid

        :param session: string representing session to check
        """
        for t in self.metadata['terms']:
            if session in t['sessions']:
                return True
        raise NoDataForPeriod(session)

    def validate_term(self, term, latest_only=False):
        """ Check that a term is present in the metadata dictionary.

        raises :exc:`~billy.scrape.NoDataForPeriod` if term is invalid

        :param term: string representing term to check
        :param latest_only: if True, will raise exception if term is not
            the current term (default: False)
        """
        if latest_only:
            if term == self.metadata['terms'][-1]['name']:
                return True
            else:
                raise NoDataForPeriod(term)

        for t in self.metadata['terms']:
            if term == t['name']:
                return True
        raise NoDataForPeriod(term)

    def save_object(self, obj):
        # copy over level information
        obj['level'] = self.level
        for f in settings.BILLY_LEVEL_FIELDS[self.level]:
            obj[f] = getattr(self, f)

        filename = obj.get_filename()

        with open(os.path.join(self.output_dir, self.scraper_type,
                               filename), 'w') as f:
            json.dump(obj, f, cls=JSONDateEncoder)

        # validate after writing, allows for inspection
        self.validate_json(obj)
def main():
    parser = argparse.ArgumentParser(
        description='Scrape data for state, saving data to disk.',
        parents=[base_arg_parser],
    )

    parser.add_argument('state', type=str,
                        help='state scraper module (eg. nc)')
    parser.add_argument('-s', '--session', action='append', dest='sessions',
                        help='session(s) to scrape')
    parser.add_argument('-t', '--term', action='append', dest='terms',
                        help='term(s) to scrape')
    parser.add_argument('--upper', action='store_true', dest='upper',
                        default=False, help='scrape upper chamber')
    parser.add_argument('--lower', action='store_true', dest='lower',
                        default=False, help='scrape lower chamber')
    parser.add_argument('--bills', action='store_true', dest='bills',
                        default=False, help="scrape bill data")
    parser.add_argument('--legislators', action='store_true',
                        dest='legislators', default=False,
                        help="scrape legislator data")
    parser.add_argument('--committees', action='store_true', dest='committees',
                        default=False, help="scrape committee data")
    parser.add_argument('--votes', action='store_true', dest='votes',
                        default=False, help="scrape vote data")
    parser.add_argument('--events', action='store_true', dest='events',
                        default=False, help='scrape event data')
    parser.add_argument('--alldata', action='store_true', dest='alldata',
                        default=False,
                        help="scrape all available types of data")
    parser.add_argument('--strict', action='store_true', dest='strict',
                        default=False, help="fail immediately when "
                        "encountering validation warning")
    parser.add_argument('-n', '--no_cache', action='store_true',
                        dest='no_cache', help="don't use web page cache")
    parser.add_argument('--fastmode', help="scrape in fast mode",
                        action="store_true", default=False)
    parser.add_argument('-r', '--rpm', action='store', type=int, dest='rpm',
                        default=60)
    parser.add_argument('--timeout', action='store', type=int, dest='timeout',
                        default=10)

    args = parser.parse_args()

    settings.update(args)

    # set up search path
    sys.path.insert(0, os.path.join(os.path.dirname(__file__),
                                    '../../openstates'))

    # get metadata
    metadata = __import__(args.state, fromlist=['metadata']).metadata
    state = metadata['abbreviation']

    configure_logging(args.verbose, args.state)

    # make output dir
    args.output_dir = os.path.join(settings.BILLY_DATA_DIR, args.state)
    try:
        os.makedirs(args.output_dir)
    except OSError as e:
        if e.errno != 17:
            raise e

    # write metadata
    try:
        schema_path = os.path.join(os.path.split(__file__)[0],
                                   '../schemas/metadata.json')
        schema = json.load(open(schema_path))

        validator = DatetimeValidator()
        validator.validate(metadata, schema)
    except ValueError as e:
        logging.getLogger('billy').warning('metadata validation error: ' +
                                           str(e))

    with open(os.path.join(args.output_dir, 'state_metadata.json'), 'w') as f:
        json.dump(metadata, f, cls=JSONDateEncoder)

    # determine time period to run for
    if args.terms:
        for term in metadata['terms']:
            if term['name'] in args.terms:
                args.sessions.extend(term['sessions'])
    args.sessions = set(args.sessions or [])

    # determine chambers
    args.chambers = []
    if args.upper:
        args.chambers.append('upper')
    if args.lower:
        args.chambers.append('lower')
    if not args.chambers:
        args.chambers = ['upper', 'lower']

    if not (args.bills or args.legislators or args.votes or
            args.committees or args.events or args.alldata):
        raise ScrapeError("Must specify at least one of --bills, "
                          "--legislators, --committees, --votes, --events, "
                          "--alldata")

    if args.alldata:
        args.bills = True
        args.legislators = True
        args.votes = True
        args.committees = True

    if args.bills:
        _run_scraper(args.state, state, 'bills', args, metadata)
    if args.legislators:
        _run_scraper(args.state, state, 'legislators', args, metadata)
    if args.committees:
        _run_scraper(args.state, state, 'committees', args, metadata)
    if args.votes:
        _run_scraper(args.state, state, 'votes', args, metadata)
    if args.events:
        _run_scraper(args.state, state, 'events', args, metadata)