def _get_configured_scraper(scraper_type, options, metadata):
    try:
        ScraperClass = get_scraper(options.module, scraper_type)
    except ScrapeError as e:
        # silence error only when alldata is present
        if ('alldata' in options.types and
                ('no %s scraper found in' % scraper_type) in str(e)):
            return None
        else:
            raise e

    return ScraperClass(metadata,
                        output_dir=options.output_dir,
                        strict_validation=options.strict,
                        fastmode=options.fastmode)
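# Hedged usage sketch (not from the original source): one way a driver could
# consume _get_configured_scraper, skipping types that are missing from the
# module when 'alldata' was requested.  The helper name _scrape_requested_types
# is hypothetical, and it assumes each scraper exposes the scrape(chamber, time)
# entry point used by _run_scraper below.
def _scrape_requested_types(options, metadata, chamber, time):
    for scraper_type in options.types:
        if scraper_type == 'alldata':
            continue
        scraper = _get_configured_scraper(scraper_type, options, metadata)
        if scraper is None:
            # no scraper of this type in the module; tolerated only under 'alldata'
            continue
        scraper.scrape(chamber, time)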
def _run_scraper(options, metadata):
    _clear_scraped_data(options.output_dir, 'bills')
    ScraperClass = get_scraper(options.module, 'bills')

    opts = {'output_dir': options.output_dir,
            'no_cache': options.no_cache,
            'requests_per_minute': options.rpm,
            'strict_validation': options.strict,
            'retry_attempts': settings.SCRAPELIB_RETRY_ATTEMPTS,
            'retry_wait_seconds': settings.SCRAPELIB_RETRY_WAIT_SECONDS,
            }
    if options.fastmode:
        # fastmode drops the rate limit and prefers cached responses
        opts['requests_per_minute'] = 0
        opts['use_cache_first'] = True
    scraper = ScraperClass(metadata, **opts)

    print options.session, options.bill_id
    scraper.scrape_bill(options.chamber, options.session, options.bill_id)
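# Hedged sketch (not part of the original module): the option names read above
# suggest an argparse setup roughly like the following.  Flag spellings,
# defaults, and help strings are assumptions for illustration only.
import argparse

def _example_arg_parser():
    parser = argparse.ArgumentParser(description='scrape a single bill')
    parser.add_argument('module', help='scraper module to load')
    parser.add_argument('chamber', help='upper or lower')
    parser.add_argument('session', help='legislative session identifier')
    parser.add_argument('bill_id', help='bill identifier to scrape')
    parser.add_argument('--output_dir', default='data')
    parser.add_argument('--rpm', type=int, default=60,
                        help='requests per minute')
    parser.add_argument('--no_cache', action='store_true')
    parser.add_argument('--strict', action='store_true')
    parser.add_argument('--fastmode', action='store_true',
                        help='no rate limit; prefer cached responses')
    return parser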
def _run_scraper(mod_path, state, scraper_type, options, metadata):
    """
        state: lower case two letter abbreviation of state
        scraper_type: bills, legislators, committees, votes, events
    """
    _clear_scraped_data(options.output_dir, scraper_type)

    try:
        ScraperClass = get_scraper(mod_path, state, scraper_type)
    except ScrapeError as e:
        # only re-raise if not alldata
        if not options.alldata:
            raise e
        else:
            return

    opts = {'output_dir': options.output_dir,
            'no_cache': options.no_cache,
            'requests_per_minute': options.rpm,
            'timeout': options.timeout,
            'strict_validation': options.strict,
            'retry_attempts': settings.SCRAPELIB_RETRY_ATTEMPTS,
            'retry_wait_seconds': settings.SCRAPELIB_RETRY_WAIT_SECONDS,
            }
    if options.fastmode:
        opts['requests_per_minute'] = 0
        opts['use_cache_first'] = True
    scraper = ScraperClass(metadata, **opts)

    # times: the list to iterate over for second scrape param
    if scraper_type in ('bills', 'votes', 'events'):
        if not options.sessions:
            if options.terms:
                times = []
                for term in options.terms:
                    scraper.validate_term(term)
                    for metaterm in metadata['terms']:
                        if term == metaterm['name']:
                            times.extend(metaterm['sessions'])
            else:
                latest_session = metadata['terms'][-1]['sessions'][-1]
                print('No session specified, using latest "%s"' %
                      latest_session)
                times = [latest_session]
        else:
            times = options.sessions

        # validate sessions
        for time in times:
            scraper.validate_session(time)

    elif scraper_type in ('legislators', 'committees'):
        if not options.terms:
            latest_term = metadata['terms'][-1]['name']
            print('No term specified, using latest "%s"' % latest_term)
            times = [latest_term]
        else:
            times = options.terms

        # validate terms
        for time in times:
            scraper.validate_term(time)

    # run scraper against year/session/term
    for time in times:
        for chamber in options.chambers:
            scraper.scrape(chamber, time)
        if scraper_type == 'events' and len(options.chambers) == 2:
            scraper.scrape('other', time)
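# Hedged, self-contained illustration of the term/session resolution above.
# The metadata shape is assumed from how the function reads it: each entry in
# metadata['terms'] has a 'name' and a list of 'sessions', newest last.  The
# helper name and the example values are made up for illustration.
example_metadata = {
    'terms': [
        {'name': '2009-2010', 'sessions': ['2009', '2010']},
        {'name': '2011-2012', 'sessions': ['2011', '2011 Special', '2012']},
    ],
}

def _sessions_for_terms(metadata, terms):
    # expand each requested term into its sessions, as the bills/votes branch does
    times = []
    for term in terms:
        for metaterm in metadata['terms']:
            if term == metaterm['name']:
                times.extend(metaterm['sessions'])
    return times

assert _sessions_for_terms(example_metadata, ['2011-2012']) == \
    ['2011', '2011 Special', '2012']
# with no sessions or terms given, the latest session is used:
assert example_metadata['terms'][-1]['sessions'][-1] == '2012'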