Example #1
File: base.py Project: rshorey/pupa
    def __init__(self, jurisdiction, datadir, *, strict_validation=True, fastmode=False):
        super(Scraper, self).__init__()

        # set options
        self.jurisdiction = jurisdiction
        self.datadir = datadir

        # scrapelib setup
        self.timeout = settings.SCRAPELIB_TIMEOUT
        self.requests_per_minute = settings.SCRAPELIB_RPM
        self.retry_attempts = settings.SCRAPELIB_RETRY_ATTEMPTS
        self.retry_wait_seconds = settings.SCRAPELIB_RETRY_WAIT_SECONDS
        self.follow_robots = False

        # caching
        if settings.CACHE_DIR:
            self.cache_storage = scrapelib.FileCache(settings.CACHE_DIR)

        if fastmode:
            self.requests_per_minute = 0
            self.cache_write_only = False

        # validation
        self.strict_validation = strict_validation

        # 'type' -> {set of names}
        self.output_names = defaultdict(set)

        # logging convenience methods
        self.logger = logging.getLogger("pupa")
        self.info = self.logger.info
        self.debug = self.logger.debug
        self.warning = self.logger.warning
        self.error = self.logger.error
        self.critical = self.logger.critical
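Most of the examples on this page repeat the same scrapelib setup: attach a scrapelib.FileCache, read responses back from it (cache_write_only = False), and drop the request throttle when a "fastmode" flag is set. Below is a minimal standalone sketch of that shared pattern; the cache directory name and URL are illustrative placeholders, not values taken from any of these projects.

    # Minimal sketch of the caching/fastmode pattern shared by the examples above.
    # The cache directory and URL are placeholders.
    import scrapelib

    s = scrapelib.Scraper(retry_attempts=3, retry_wait_seconds=10)
    s.cache_storage = scrapelib.FileCache('_cache')  # persist responses on disk
    s.cache_write_only = False  # also serve requests from the cache
    s.requests_per_minute = 0   # no throttling, as the fastmode branches do

    response = s.get('https://example.com/')
    print(response.status_code)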
Example #2
    def run(self):
        # We first will ensure the cache and data directories exist
        if not os.path.exists(CACHE_DIR):
            os.makedirs(CACHE_DIR)
        if not os.path.exists(DATA_DIR):
            os.makedirs(DATA_DIR)

        # We use scrapelib as we are unsure of the integrity of the server we will be pulling from
        s = scrapelib.Scraper(retry_wait_seconds=5, retry_attempts=10)

        # Enable caching so we don't repeat downloads
        s.cache_storage = scrapelib.FileCache(CACHE_DIR)
        s.cache_write_only = False

        # Simple download function
        def download_entity(s, filename):
            """ Download an asset """
            logging.info('Downloading %s from %s' %
                         (filename, join(SOURCE_URL, filename)))
            s.urlretrieve('%s/%s' % (SOURCE_URL, filename),
                          '%s/%s' % (self.output().path, filename))

        # Download the data!
        os.system('mkdir -p "%s"' % self.output().path)
        for filename in CANDIDATE_SOURCE_FILES.values():
            download_entity(s, filename)

        for filename in COMMITTEE_SOURCE_FILES.values():
            download_entity(s, filename)

        for filename in META_SOURCE_FILES.values():
            download_entity(s, filename)
Example #3
    def __init__(self,
                 metadata,
                 output_dir=None,
                 strict_validation=None,
                 fastmode=False):
        """
        Create a new Scraper instance.

        :param metadata: metadata for this scraper
        :param output_dir: the data directory to use
        :param strict_validation: exit immediately if validation fails
        """
        super(Scraper, self).__init__()

        # scrapelib overrides
        self.timeout = settings.SCRAPELIB_TIMEOUT
        self.cache_storage = scrapelib.FileCache(settings.BILLY_CACHE_DIR)
        self.requests_per_minute = settings.SCRAPELIB_RPM
        self.retry_attempts = settings.SCRAPELIB_RETRY_ATTEMPTS
        self.retry_wait_seconds = settings.SCRAPELIB_RETRY_WAIT_SECONDS

        if fastmode:
            self.requests_per_minute = 0
            self.cache_write_only = False

        # if scraper uses dryscrape, set up session
        if settings.USES_DRYSCRAPE:
            dryscrape.start_xvfb()
            self.session = dryscrape.Session()

        self.metadata = metadata
        self.output_dir = output_dir
        self.output_names = set()

        # make output_dir
        os.path.isdir(self.output_dir) or os.makedirs(self.output_dir)

        # validation
        self.strict_validation = strict_validation
        self.validator = DatetimeValidator()
        self._schema = {}
        self._load_schemas()

        # logging convenience methods
        self.logger = logging.getLogger("billy")
        self.log = self.logger.info
        self.info = self.logger.info
        self.debug = self.logger.debug
        self.warning = self.logger.warning
        self.error = self.logger.error
        self.critical = self.logger.critical
Example #4
    def __init__(self,
                 metadata,
                 output_dir=None,
                 strict_validation=None,
                 fastmode=False,
                 **kwargs):
        """
        Create a new Scraper instance.

        :param metadata: metadata for this scraper
        :param output_dir: the data directory to use
        :param strict_validation: exit immediately if validation fails
        """

        # configure underlying scrapelib object
        kwargs['cache_obj'] = scrapelib.FileCache(settings.BILLY_CACHE_DIR)
        kwargs['requests_per_minute'] = settings.SCRAPELIB_RPM
        kwargs['timeout'] = settings.SCRAPELIB_TIMEOUT
        kwargs['retry_attempts'] = settings.SCRAPELIB_RETRY_ATTEMPTS
        kwargs['retry_wait_seconds'] = settings.SCRAPELIB_RETRY_WAIT_SECONDS

        if fastmode:
            kwargs['requests_per_minute'] = 0
            kwargs['cache_write_only'] = False

        super(Scraper, self).__init__(**kwargs)

        self.metadata = metadata
        self.output_dir = output_dir
        self.output_names = set()

        # make output_dir
        os.path.isdir(self.output_dir) or os.makedirs(self.output_dir)

        # validation
        self.strict_validation = strict_validation
        self.validator = DatetimeValidator()
        self._schema = {}
        self._load_schemas()

        self.follow_robots = False

        # logging convenience methods
        self.logger = logging.getLogger("billy")
        self.log = self.logger.info
        self.info = self.logger.info
        self.debug = self.logger.debug
        self.warning = self.logger.warning
        self.error = self.logger.error
        self.critical = self.logger.critical
Example #5
    def download_files(self):
        s = scrapelib.Scraper(requests_per_minute=self.req_per_min,
                              retry_attempts=2)

        # enable cache on scrapelib
        cache_dir = os.path.join(os.getcwd(), 'cache')
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)

        s.cache_storage = scrapelib.FileCache(cache_dir)
        s.cache_write_only = False
        # TODO : update scrapelib to check last modified header

        with closing(
                shelve.open(os.path.join(self.import_dir,
                                         self.shelf_file))) as db:
            for key in db.keys():
                dir_for_solnbr = self.create_dir_by_solnbr(key)

                attachments = db[key]['attachments']

                for (i, a) in enumerate(attachments):
                    self.log.info("Downloading file ({}: {}) from {}".format(
                        a['filename'], a['desc'], a['url']))

                    # parse URL into components
                    u = urlparse(a['url'])

                    # match main portion to dict of special cases, get function to use
                    downloader_func = downloaders.func_map.get(
                        u.netloc, downloaders.default)

                    try:
                        local_file_path = downloader_func(s,
                                                          a['url'],
                                                          dir_for_solnbr,
                                                          solnbr=key)
                        a.update({'local_file_path': local_file_path})
                    except Exception:
                        self.log.exception(
                            "Attachment couldn't be retrieved for unknown reasons. URL: {} Continuing."
                            .format(a['url']))
                        a.update({'exception': 1})
                        continue
                    finally:
                        attachments[i] = a

                meta = {'dl_complete': True, 'num_dl': len(attachments)}
                db[key] = {'attachments': attachments, 'meta': meta}
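The downloader_func lookup in Example #5 dispatches on the URL's host: hosts that need special handling map to dedicated download functions, and everything else falls back to a default. A toy sketch of that dispatch follows; the host and function names are hypothetical, and only the func_map.get(netloc, default) pattern mirrors the code above.

    # Toy per-host downloader dispatch; names are hypothetical.
    from urllib.parse import urlparse

    import scrapelib

    def default_download(session, url, dest_dir, **kwargs):
        # fetch url and save it under its original name inside dest_dir
        dest = '%s/%s' % (dest_dir, url.rsplit('/', 1)[-1])
        session.urlretrieve(url, dest)
        return dest

    def awkward_host_download(session, url, dest_dir, **kwargs):
        # placeholder for a host that needs special treatment
        return default_download(session, url, dest_dir, **kwargs)

    func_map = {'ftp.example.gov': awkward_host_download}

    s = scrapelib.Scraper()
    url = 'https://www.example.gov/files/report.pdf'
    downloader = func_map.get(urlparse(url).netloc, default_download)
    # local_path = downloader(s, url, '/tmp/downloads')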
Example #6
    def __init__(self,
                 jurisdiction,
                 datadir,
                 *,
                 strict_validation=True,
                 fastmode=False):
        super(Scraper, self).__init__()

        # set options
        self.jurisdiction = jurisdiction
        self.datadir = datadir

        # scrapelib setup
        self.timeout = settings.SCRAPELIB_TIMEOUT
        self.requests_per_minute = settings.SCRAPELIB_RPM
        self.retry_attempts = settings.SCRAPELIB_RETRY_ATTEMPTS
        self.retry_wait_seconds = settings.SCRAPELIB_RETRY_WAIT_SECONDS
        self.verify = settings.SCRAPELIB_VERIFY

        # caching
        if settings.CACHE_DIR:
            self.cache_storage = scrapelib.FileCache(settings.CACHE_DIR)

        if fastmode:
            self.requests_per_minute = 0
            self.cache_write_only = False

        # validation
        self.strict_validation = strict_validation

        # 'type' -> {set of names}
        self.output_names = defaultdict(set)

        # logging convenience methods
        self.logger = logging.getLogger("openstates")
        self.info = self.logger.info
        self.debug = self.logger.debug
        self.warning = self.logger.warning
        self.error = self.logger.error
        self.critical = self.logger.critical

        modname = os.environ.get("SCRAPE_OUTPUT_HANDLER")
        if modname is None:
            self.scrape_output_handler = None
        else:
            handler = importlib.import_module(modname)
            self.scrape_output_handler = handler.Handler(self)
Example #7
def main(abbr):

    request_defaults = {
        # 'proxies': {"http": "localhost:8888"},
        'timeout': 5.0,
        'headers': {
            'Accept': ('text/html,application/xhtml+xml,application/'
                       'xml;q=0.9,*/*;q=0.8'),
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-us,en;q=0.5',
            'User-Agent': ('Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0.2) '
                           'Gecko/20100101 Firefox/10.0.2'),
        },
        'follow_robots': False,

        # Note: this script needs to be run in the same dir as billy_settings.py
    }

    logger = logbook.Logger()
    DATA = join(settings.BILLY_DATA_DIR, abbr, 'billtext')

    try:
        os.makedirs(DATA)
    except OSError:
        pass
    logger.info('writing files to %r' % DATA)

    session = scrapelib.Scraper(cache_obj=scrapelib.FileCache('cache'),
                                cache_write_only=False,
                                use_cache_first=True,
                                requests_per_minute=0,
                                **request_defaults)

    for bill in db.bills.find({'state': abbr}):
        if len(bill['versions']):
            bill_id = bill['bill_id']
            url = bill['versions'][0]['url']
            logger.info('trying %r: %r' % (bill_id, url))
            text = session.get(url).text
            with open(join(DATA, bill['_id']), 'wb') as f:
                f.write(text.encode('utf-8'))
Example #8
    def __init__(self):
        super().__init__()
        self.checked_places = set()
        logging.config.dictConfig(self.LOGGING_CONFIG)
        self.logger = logging.getLogger('legistar')

        # scrapelib setup
        self.timeout = self.SCRAPELIB_TIMEOUT
        self.requests_per_minute = self.SCRAPELIB_RPM
        self.retry_attempts = self.SCRAPELIB_RETRY_ATTEMPTS
        self.retry_wait_seconds = self.SCRAPELIB_RETRY_WAIT_SECONDS
        self.follow_robots = False

        # if self.PROXIES:
        #     self.proxies = self.PROXIES

        if self.FASTMODE:
            self.cache_write_only = False

        cache_dir = '.cache'
        self.cache_storage = scrapelib.FileCache(cache_dir)