Example #1
0
class Scraper(scrapelib.Scraper):
    """
    Base scraper built on top of ``scrapelib.Scraper``.

    Fills in scrapelib options from ``settings`` when the caller does not
    supply them, and adds JSON validation plus logging shortcuts
    (``log``, ``debug``, ``warning``).

    Instances must have a ``state`` attribute once the scrapelib
    initializer has run (presumably defined on the subclass).
    """

    def __init__(self, metadata, no_cache=False, output_dir=None,
                 strict_validation=None, **kwargs):
        """
        Create a new Scraper instance.

        :param metadata: metadata for this state
        :param no_cache: if True, will ignore any cached downloads
        :param output_dir: the Fifty State data directory to use
        :param strict_validation: exit immediately if validation fails
        :param kwargs: extra options passed through to
            ``scrapelib.Scraper.__init__``
        :raises Exception: if the instance has no ``state`` attribute
        """

        # configure underlying scrapelib object
        if no_cache:
            # explicit request to skip the cache overrides any cache_dir
            kwargs['cache_dir'] = None
        elif 'cache_dir' not in kwargs:
            kwargs['cache_dir'] = getattr(settings, 'FIFTYSTATES_CACHE_DIR',
                                          None)

        if 'error_dir' not in kwargs:
            kwargs['error_dir'] = getattr(settings, 'FIFTYSTATES_ERROR_DIR',
                                          None)

        if 'timeout' not in kwargs:
            kwargs['timeout'] = getattr(settings, 'SCRAPELIB_TIMEOUT',
                                        600)

        if 'requests_per_minute' not in kwargs:
            kwargs['requests_per_minute'] = None

        super(Scraper, self).__init__(**kwargs)

        if not hasattr(self, 'state'):
            raise Exception('Scrapers must have a state attribute')

        self.metadata = metadata
        self.output_dir = output_dir

        # validation
        self.strict_validation = strict_validation
        self.validator = DatetimeValidator()

        self.follow_robots = False

        # logging convenience methods
        self.logger = logging.getLogger("fiftystates")
        self.log = self.logger.info
        self.debug = self.logger.debug
        self.warning = self.logger.warning

    def validate_json(self, obj):
        """
        Validate ``obj`` against this scraper's schema.

        The schema is fetched once through ``self._get_schema()`` and
        cached on the instance as ``_schema``.  A ``ValueError`` from the
        validator is always logged as a warning; it is re-raised only when
        ``strict_validation`` is truthy.

        :param obj: the object to validate
        :raises ValueError: if validation fails and ``strict_validation``
            is set
        """
        if not hasattr(self, '_schema'):
            self._schema = self._get_schema()
        try:
            self.validator.validate(obj, self._schema)
        except ValueError as ve:
            self.warning(str(ve))
            if self.strict_validation:
                # bare raise keeps the traceback pointing at the validator
                raise
Example #2
0
    def __init__(self, metadata, no_cache=False, output_dir=None,
                 strict_validation=None, **kwargs):
        """
        Build a Scraper, filling in scrapelib options from ``settings``.

        Every scrapelib keyword that the caller did not pass explicitly
        falls back to the matching ``settings`` attribute, or to a
        built-in default when that attribute is missing.

        :param metadata: metadata for this state
        :param no_cache: if True, forces ``cache_dir`` to None so cached
            downloads are ignored
        :param output_dir: the Fifty State data directory to use
        :param strict_validation: whether validation failures should be
            fatal (stored on the instance)
        :param kwargs: extra options handed to the parent initializer
        :raises Exception: if the instance has no ``state`` attribute
        """
        # no_cache wins over any cache_dir the caller supplied
        if no_cache:
            kwargs['cache_dir'] = None

        # (scrapelib keyword, settings attribute, fallback value)
        settings_backed = (
            ('cache_dir', 'FIFTYSTATES_CACHE_DIR', None),
            ('error_dir', 'FIFTYSTATES_ERROR_DIR', None),
            ('timeout', 'SCRAPELIB_TIMEOUT', 600),
            ('retry_attempts', 'SCRAPELIB_RETRY_ATTEMPTS', 3),
            ('retry_wait_seconds', 'SCRAPELIB_RETRY_WAIT_SECONDS', 10),
        )
        for option, setting_name, fallback in settings_backed:
            # only consult settings when the caller left the option unset
            if option not in kwargs:
                kwargs[option] = getattr(settings, setting_name, fallback)

        # rate limiting has no settings hook; default to None
        kwargs.setdefault('requests_per_minute', None)

        super(Scraper, self).__init__(**kwargs)

        if not hasattr(self, 'state'):
            raise Exception('Scrapers must have a state attribute')

        self.metadata, self.output_dir = metadata, output_dir

        # validation state
        self.strict_validation = strict_validation
        self.validator = DatetimeValidator()

        self.follow_robots = False

        # shortcuts onto the shared "fiftystates" logger
        self.logger = logging.getLogger("fiftystates")
        self.log, self.debug, self.warning = (self.logger.info,
                                              self.logger.debug,
                                              self.logger.warning)
Example #3
0
    )

    # make output dir if it doesn't exist
    output_dir = options.output_dir or os.path.join("data", state)
    try:
        os.makedirs(output_dir)
    except OSError, e:
        if e.errno != 17:
            raise e

    # write metadata
    try:
        schema_path = os.path.join(os.path.split(__file__)[0], "../../schemas/metadata.json")
        schema = json.load(open(schema_path))

        validator = DatetimeValidator()
        validator.validate(metadata, schema)
    except ValueError, e:
        logging.getLogger("fiftystates").warning("metadata validation error: " + str(e))

    with open(os.path.join(output_dir, "state_metadata.json"), "w") as f:
        json.dump(metadata, f, cls=JSONDateEncoder)

    # determine years
    years = options.years

    # determine sessions
    sessions = options.sessions
    terms = options.terms
    if terms:
        for term in metadata["terms"]: