import logging

import scrapelib

# settings, DatetimeValidator (and JSONDateEncoder used further below) come
# from elsewhere in the fiftystates package; their exact import paths are
# not shown in this excerpt.


class Scraper(scrapelib.Scraper):

    def __init__(self, metadata, no_cache=False, output_dir=None,
                 strict_validation=None, **kwargs):
        """
        Create a new Scraper instance.

        :param metadata: metadata for this state
        :param no_cache: if True, will ignore any cached downloads
        :param output_dir: the Fifty State data directory to use
        :param strict_validation: exit immediately if validation fails
        """

        # configure underlying scrapelib object
        if no_cache:
            kwargs['cache_dir'] = None
        elif 'cache_dir' not in kwargs:
            kwargs['cache_dir'] = getattr(settings,
                                          'FIFTYSTATES_CACHE_DIR',
                                          None)

        if 'error_dir' not in kwargs:
            kwargs['error_dir'] = getattr(settings,
                                          'FIFTYSTATES_ERROR_DIR',
                                          None)

        if 'timeout' not in kwargs:
            kwargs['timeout'] = getattr(settings, 'SCRAPELIB_TIMEOUT', 600)

        if 'requests_per_minute' not in kwargs:
            kwargs['requests_per_minute'] = None

        if 'retry_attempts' not in kwargs:
            kwargs['retry_attempts'] = getattr(settings,
                                               'SCRAPELIB_RETRY_ATTEMPTS',
                                               3)

        if 'retry_wait_seconds' not in kwargs:
            kwargs['retry_wait_seconds'] = getattr(
                settings, 'SCRAPELIB_RETRY_WAIT_SECONDS', 10)

        super(Scraper, self).__init__(**kwargs)

        if not hasattr(self, 'state'):
            raise Exception('Scrapers must have a state attribute')

        self.metadata = metadata
        self.output_dir = output_dir

        # validation
        self.strict_validation = strict_validation
        self.validator = DatetimeValidator()

        self.follow_robots = False

        # logging convenience methods
        self.logger = logging.getLogger("fiftystates")
        self.log = self.logger.info
        self.debug = self.logger.debug
        self.warning = self.logger.warning

    def validate_json(self, obj):
        # the schema is loaded lazily on first use via _get_schema(),
        # which concrete scrapers are expected to provide
        if not hasattr(self, '_schema'):
            self._schema = self._get_schema()
        try:
            self.validator.validate(obj, self._schema)
        except ValueError as ve:
            self.warning(str(ve))
            if self.strict_validation:
                raise ve
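
# A minimal sketch of how a state module is expected to subclass Scraper.
# ExampleBillScraper, the 'ex' state code and the bill dict are all
# hypothetical, and validate_json() additionally relies on a _get_schema()
# method (not shown in this excerpt) to supply the schema to check against.
class ExampleBillScraper(Scraper):
    state = 'ex'  # required: __init__ raises without a state attribute

    def scrape(self, chamber, session):
        # state metadata passed to __init__ is available as self.metadata
        self.log("scraping %s bills for session %s", chamber, session)
        bill = {'state': self.state, 'session': session,
                'bill_id': 'HB 1', 'title': 'An example bill'}
        self.validate_json(bill)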
# From the scrape runner (excerpt): options, state and metadata are already
# in scope here; errno, os, json and logging are assumed to be imported at
# the top of the module.

# make output dir if it doesn't exist
output_dir = options.output_dir or os.path.join("data", state)
try:
    os.makedirs(output_dir)
except OSError as e:
    if e.errno != errno.EEXIST:  # 17: the directory already exists
        raise

# write metadata
try:
    schema_path = os.path.join(os.path.split(__file__)[0],
                               "../../schemas/metadata.json")
    schema = json.load(open(schema_path))

    validator = DatetimeValidator()
    validator.validate(metadata, schema)
except ValueError as e:
    logging.getLogger("fiftystates").warning(
        "metadata validation error: " + str(e))

with open(os.path.join(output_dir, "state_metadata.json"), "w") as f:
    json.dump(metadata, f, cls=JSONDateEncoder)

# determine years
years = options.years

# determine sessions
sessions = options.sessions
terms = options.terms

if terms:
    for term in metadata["terms"]:
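
# Standalone sketch of why json.dump above needs cls=JSONDateEncoder: the
# stdlib encoder rejects date/datetime values, which state metadata (for
# example, term start dates) contains.  The fragment below is hypothetical;
# it assumes JSONDateEncoder subclasses json.JSONEncoder and converts
# date/datetime objects to a serializable form.
import datetime
import json

fragment = {'name': 'Example State',
            'term_start': datetime.date(2011, 1, 5)}
serialized = json.dumps(fragment, cls=JSONDateEncoder)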