def __init__(self, jurisdiction, datadir, *, strict_validation=True, fastmode=False):
    super(Scraper, self).__init__()

    # set options
    self.jurisdiction = jurisdiction
    self.datadir = datadir

    # scrapelib setup
    self.timeout = settings.SCRAPELIB_TIMEOUT
    self.requests_per_minute = settings.SCRAPELIB_RPM
    self.retry_attempts = settings.SCRAPELIB_RETRY_ATTEMPTS
    self.retry_wait_seconds = settings.SCRAPELIB_RETRY_WAIT_SECONDS
    self.follow_robots = False

    # caching
    if settings.CACHE_DIR:
        self.cache_storage = scrapelib.FileCache(settings.CACHE_DIR)

    if fastmode:
        self.requests_per_minute = 0
        self.cache_write_only = False

    # validation
    self.strict_validation = strict_validation

    # 'type' -> {set of names}
    self.output_names = defaultdict(set)

    # logging convenience methods
    self.logger = logging.getLogger("pupa")
    self.info = self.logger.info
    self.debug = self.logger.debug
    self.warning = self.logger.warning
    self.error = self.logger.error
    self.critical = self.logger.critical
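# A minimal, standalone sketch of the caching pattern used above -- assuming
# only that scrapelib is installed; the cache directory and rate values are
# illustrative, not taken from any project's settings module.
import scrapelib

s = scrapelib.Scraper(requests_per_minute=60, retry_attempts=3)
s.cache_storage = scrapelib.FileCache('./_cache')  # hypothetical cache dir
s.cache_write_only = False  # False means cached responses are also *read*,
                            # which is the behavior fastmode enables above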
def run(self):
    # First, ensure the cache and data directories exist
    if not os.path.exists(CACHE_DIR):
        os.makedirs(CACHE_DIR)
    if not os.path.exists(DATA_DIR):
        os.makedirs(DATA_DIR)

    # Use scrapelib because the server we pull from may be unreliable;
    # failed requests are retried automatically
    s = scrapelib.Scraper(retry_wait_seconds=5, retry_attempts=10)

    # Enable caching so we don't repeat downloads
    s.cache_storage = scrapelib.FileCache(CACHE_DIR)
    s.cache_write_only = False

    # Simple download function
    def download_entity(s, filename):
        """Download an asset"""
        logging.info('Downloading %s from %s' % (filename, join(SOURCE_URL, filename)))
        s.urlretrieve('%s/%s' % (SOURCE_URL, filename),
                      '%s/%s' % (self.output().path, filename))

    # Download the data!
    os.system('mkdir -p "%s"' % self.output().path)
    for filename in CANDIDATE_SOURCE_FILES.values():
        download_entity(s, filename)
    for filename in COMMITTEE_SOURCE_FILES.values():
        download_entity(s, filename)
    for filename in META_SOURCE_FILES.values():
        download_entity(s, filename)
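# scrapelib's urlretrieve (used by download_entity above) saves a response
# body straight to disk. A hedged sketch of the same download step -- the
# constants and filename below are stand-ins, not values from the task:
import os
import scrapelib

SOURCE_URL = 'https://example.com/data'   # stand-in for the task's constant
dest_dir = '/tmp/downloads'               # stand-in for self.output().path

s = scrapelib.Scraper(retry_wait_seconds=5, retry_attempts=10)
os.makedirs(dest_dir, exist_ok=True)      # portable alternative to `mkdir -p`
s.urlretrieve('%s/%s' % (SOURCE_URL, 'candidates.csv'),   # filename is made up
              os.path.join(dest_dir, 'candidates.csv'))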
def __init__(self, metadata, output_dir=None, strict_validation=None, fastmode=False):
    """
    Create a new Scraper instance.

    :param metadata: metadata for this scraper
    :param output_dir: the data directory to use
    :param strict_validation: exit immediately if validation fails
    """
    super(Scraper, self).__init__()

    # scrapelib overrides
    self.timeout = settings.SCRAPELIB_TIMEOUT
    self.cache_storage = scrapelib.FileCache(settings.BILLY_CACHE_DIR)
    self.requests_per_minute = settings.SCRAPELIB_RPM
    self.retry_attempts = settings.SCRAPELIB_RETRY_ATTEMPTS
    self.retry_wait_seconds = settings.SCRAPELIB_RETRY_WAIT_SECONDS

    if fastmode:
        self.requests_per_minute = 0
        self.cache_write_only = False

    # if scraper uses dryscrape, set up session
    if settings.USES_DRYSCRAPE:
        dryscrape.start_xvfb()
        self.session = dryscrape.Session()

    self.metadata = metadata
    self.output_dir = output_dir
    self.output_names = set()

    # make output_dir if it doesn't already exist
    os.path.isdir(self.output_dir) or os.makedirs(self.output_dir)

    # validation
    self.strict_validation = strict_validation
    self.validator = DatetimeValidator()
    self._schema = {}
    self._load_schemas()

    # logging convenience methods
    self.logger = logging.getLogger("billy")
    self.log = self.logger.info
    self.info = self.logger.info
    self.debug = self.logger.debug
    self.warning = self.logger.warning
    self.error = self.logger.error
    self.critical = self.logger.critical
def __init__(self, metadata, output_dir=None, strict_validation=None, fastmode=False, **kwargs):
    """
    Create a new Scraper instance.

    :param metadata: metadata for this scraper
    :param output_dir: the data directory to use
    :param strict_validation: exit immediately if validation fails
    """
    # configure underlying scrapelib object
    kwargs['cache_obj'] = scrapelib.FileCache(settings.BILLY_CACHE_DIR)
    kwargs['requests_per_minute'] = settings.SCRAPELIB_RPM
    kwargs['timeout'] = settings.SCRAPELIB_TIMEOUT
    kwargs['retry_attempts'] = settings.SCRAPELIB_RETRY_ATTEMPTS
    kwargs['retry_wait_seconds'] = settings.SCRAPELIB_RETRY_WAIT_SECONDS

    if fastmode:
        kwargs['requests_per_minute'] = 0
        kwargs['cache_write_only'] = False

    super(Scraper, self).__init__(**kwargs)

    self.metadata = metadata
    self.output_dir = output_dir
    self.output_names = set()

    # make output_dir if it doesn't already exist
    os.path.isdir(self.output_dir) or os.makedirs(self.output_dir)

    # validation
    self.strict_validation = strict_validation
    self.validator = DatetimeValidator()
    self._schema = {}
    self._load_schemas()

    self.follow_robots = False

    # logging convenience methods
    self.logger = logging.getLogger("billy")
    self.log = self.logger.info
    self.info = self.logger.info
    self.debug = self.logger.debug
    self.warning = self.logger.warning
    self.error = self.logger.error
    self.critical = self.logger.critical
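# This variant passes the scrapelib options as constructor kwargs instead of
# mutating attributes after super().__init__(), as the earlier examples do;
# both end in the same configured state. A minimal sketch with illustrative
# values, using the same cache_obj/cache_write_only kwargs seen above:
import scrapelib

s = scrapelib.Scraper(requests_per_minute=0,   # 0 disables throttling (fastmode)
                      timeout=60,
                      retry_attempts=3,
                      retry_wait_seconds=10,
                      cache_obj=scrapelib.FileCache('./_cache'),
                      cache_write_only=False)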
def download_files(self):
    s = scrapelib.Scraper(requests_per_minute=self.req_per_min, retry_attempts=2)

    # enable cache on scrapelib, creating the cache dir if needed
    cache_dir = os.path.join(os.getcwd(), 'cache')
    if not os.path.exists(cache_dir):
        os.mkdir(cache_dir)
    s.cache_storage = scrapelib.FileCache(cache_dir)
    s.cache_write_only = False
    # TODO: update scrapelib to check the Last-Modified header

    with closing(shelve.open(os.path.join(self.import_dir, self.shelf_file))) as db:
        for key in db.keys():
            dir_for_solnbr = self.create_dir_by_solnbr(key)

            attachments = db[key]['attachments']
            for (i, a) in enumerate(attachments):
                self.log.info("Downloading file ({}: {}) from {}".format(
                    a['filename'], a['desc'], a['url']))

                # parse URL into components
                u = urlparse(a['url'])

                # match main portion to dict of special cases, get function to use
                downloader_func = downloaders.func_map.get(u.netloc, downloaders.default)

                try:
                    local_file_path = downloader_func(s, a['url'], dir_for_solnbr, solnbr=key)
                    a.update({'local_file_path': local_file_path})
                except Exception:  # not a bare except, so KeyboardInterrupt still propagates
                    self.log.exception(
                        "Attachment couldn't be retrieved for unknown reasons. "
                        "URL: {} Continuing.".format(a['url']))
                    a.update({'exception': 1})
                    continue
                finally:
                    attachments[i] = a

            meta = {'dl_complete': True, 'num_dl': len(attachments)}
            db[key] = {'attachments': attachments, 'meta': meta}
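# Why the loop above ends with `db[key] = {...}`: a shelve opened without
# writeback=True only persists a value when the key is reassigned; mutating a
# retrieved object in place is not written back. A standalone sketch (the
# path and key are illustrative):
import shelve
from contextlib import closing

with closing(shelve.open('/tmp/attachments.db')) as db:
    record = db.get('SOL-0001', {'attachments': [], 'meta': {}})
    record['meta'] = {'dl_complete': True, 'num_dl': len(record['attachments'])}
    db['SOL-0001'] = record  # reassignment is what actually persists the change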
def __init__(self, jurisdiction, datadir, *, strict_validation=True, fastmode=False):
    super(Scraper, self).__init__()

    # set options
    self.jurisdiction = jurisdiction
    self.datadir = datadir

    # scrapelib setup
    self.timeout = settings.SCRAPELIB_TIMEOUT
    self.requests_per_minute = settings.SCRAPELIB_RPM
    self.retry_attempts = settings.SCRAPELIB_RETRY_ATTEMPTS
    self.retry_wait_seconds = settings.SCRAPELIB_RETRY_WAIT_SECONDS
    self.verify = settings.SCRAPELIB_VERIFY

    # caching
    if settings.CACHE_DIR:
        self.cache_storage = scrapelib.FileCache(settings.CACHE_DIR)

    if fastmode:
        self.requests_per_minute = 0
        self.cache_write_only = False

    # validation
    self.strict_validation = strict_validation

    # 'type' -> {set of names}
    self.output_names = defaultdict(set)

    # logging convenience methods
    self.logger = logging.getLogger("openstates")
    self.info = self.logger.info
    self.debug = self.logger.debug
    self.warning = self.logger.warning
    self.error = self.logger.error
    self.critical = self.logger.critical

    modname = os.environ.get("SCRAPE_OUTPUT_HANDLER")
    if modname is None:
        self.scrape_output_handler = None
    else:
        handler = importlib.import_module(modname)
        self.scrape_output_handler = handler.Handler(self)
def main(abbr):
    request_defaults = {
        # 'proxies': {"http": "localhost:8888"},
        'timeout': 5.0,
        'headers': {
            'Accept': ('text/html,application/xhtml+xml,application/'
                       'xml;q=0.9,*/*;q=0.8'),
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-us,en;q=0.5',
            'User-Agent': ('Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0.2) '
                           'Gecko/20100101 Firefox/10.0.2'),
        },
        'follow_robots': False,
        # Note: this script needs to run in the same dir as billy_settings.py
    }

    logger = logbook.Logger()
    DATA = join(settings.BILLY_DATA_DIR, abbr, 'billtext')
    try:
        os.makedirs(DATA)
    except OSError:
        pass
    logger.info('writing files to %r' % DATA)

    session = scrapelib.Scraper(cache_obj=scrapelib.FileCache('cache'),
                                cache_write_only=False,
                                use_cache_first=True,
                                requests_per_minute=0,
                                **request_defaults)

    for bill in db.bills.find({'state': abbr}):
        if len(bill['versions']):
            bill_id = bill['bill_id']
            url = bill['versions'][0]['url']
            logger.info('trying %r: %r' % (bill_id, url))
            text = session.get(url).text
            # open in binary mode since we write already-encoded bytes
            with open(join(DATA, bill['_id']), 'wb') as f:
                f.write(text.encode('utf-8'))
def __init__(self):
    super().__init__()
    self.checked_places = set()
    logging.config.dictConfig(self.LOGGING_CONFIG)
    self.logger = logging.getLogger('legistar')

    # scrapelib setup
    self.timeout = self.SCRAPELIB_TIMEOUT
    self.requests_per_minute = self.SCRAPELIB_RPM
    self.retry_attempts = self.SCRAPELIB_RETRY_ATTEMPTS
    self.retry_wait_seconds = self.SCRAPELIB_RETRY_WAIT_SECONDS
    self.follow_robots = False

    # if self.PROXIES:
    #     self.proxies = self.PROXIES

    if self.FASTMODE:
        self.cache_write_only = False

    cache_dir = '.cache'
    self.cache_storage = scrapelib.FileCache(cache_dir)