Example #1
def _get_configured_scraper(scraper_type, options, metadata):
    # get_scraper and ScrapeError are assumed to come from the project's
    # scraper utilities module
    try:
        ScraperClass = get_scraper(options.module, scraper_type)
    except ScrapeError as e:
        # silence the "no scraper found" error only when alldata was
        # requested; re-raise anything else (bare raise keeps the traceback)
        if ('alldata' in options.types and
                ('no %s scraper found in' % scraper_type) in str(e)):
            return None
        raise

    return ScraperClass(metadata,
                        output_dir=options.output_dir,
                        strict_validation=options.strict,
                        fastmode=options.fastmode)
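
Usage sketch (the option values below are hypothetical; it assumes options
is an argparse/optparse-style namespace exposing exactly the attributes the
function reads):

from argparse import Namespace

options = Namespace(module='ex', types=['bills', 'alldata'],
                    output_dir='/tmp/scraped', strict=True, fastmode=False)
metadata = {'name': 'Example State'}  # placeholder metadata dict

scraper = _get_configured_scraper('bills', options, metadata)
if scraper is None:
    print('no bills scraper for this module; skipped because of alldata')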
Example #2
def _run_scraper(mod_path, options, metadata):
    # mod_path: import path of the module that provides the bill scraper
    _clear_scraped_data(options.output_dir, 'bills')

    ScraperClass = get_scraper(mod_path, 'bills')

    opts = {'output_dir': options.output_dir,
            'no_cache': options.no_cache,
            'requests_per_minute': options.rpm,
            'strict_validation': options.strict,
            'retry_attempts': settings.SCRAPELIB_RETRY_ATTEMPTS,
            'retry_wait_seconds': settings.SCRAPELIB_RETRY_WAIT_SECONDS,
            }
    if options.fastmode:
        # fastmode: no rate limiting, prefer cached responses
        opts['requests_per_minute'] = 0
        opts['use_cache_first'] = True
    scraper = ScraperClass(metadata, **opts)

    print(options.session, options.bill_id)
    scraper.scrape_bill(options.chamber, options.session, options.bill_id)
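
Usage sketch (hypothetical values; the module path 'scrapers.ex' and the
option attributes are assumptions, not part of the original code):

from argparse import Namespace

options = Namespace(output_dir='/tmp/scraped', no_cache=False, rpm=60,
                    strict=True, fastmode=True,
                    chamber='upper', session='2010', bill_id='SB 1')
metadata = {'name': 'Example State'}  # placeholder metadata dict

_run_scraper('scrapers.ex', options, metadata)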
Example #3
def _run_scraper(mod_path, state, scraper_type, options, metadata):
    """
        state: lowercase two-letter abbreviation of the state
        scraper_type: one of bills, legislators, committees, votes, events
    """
    _clear_scraped_data(options.output_dir, scraper_type)

    try:
        ScraperClass = get_scraper(mod_path, state, scraper_type)
    except ScrapeError:
        # with alldata, silently skip states that lack this scraper;
        # otherwise re-raise
        if not options.alldata:
            raise
        return

    opts = {'output_dir': options.output_dir,
            'no_cache': options.no_cache,
            'requests_per_minute': options.rpm,
            'timeout': options.timeout,
            'strict_validation': options.strict,
            'retry_attempts': settings.SCRAPELIB_RETRY_ATTEMPTS,
            'retry_wait_seconds': settings.SCRAPELIB_RETRY_WAIT_SECONDS,
            }
    if options.fastmode:
        # fastmode: no rate limiting, prefer cached responses
        opts['requests_per_minute'] = 0
        opts['use_cache_first'] = True
    scraper = ScraperClass(metadata, **opts)

    # times: the list to iterate over for the second scrape param
    if scraper_type in ('bills', 'votes', 'events'):
        if not options.sessions:
            if options.terms:
                # expand each requested term into its component sessions
                times = []
                for term in options.terms:
                    scraper.validate_term(term)
                    for metaterm in metadata['terms']:
                        if term == metaterm['name']:
                            times.extend(metaterm['sessions'])
            else:
                latest_session = metadata['terms'][-1]['sessions'][-1]
                print('No session specified, using latest "%s"' %
                      latest_session)
                times = [latest_session]
        else:
            times = options.sessions

        # validate sessions
        for time in times:
            scraper.validate_session(time)
    elif scraper_type in ('legislators', 'committees'):
        if not options.terms:
            latest_term = metadata['terms'][-1]['name']
            print('No term specified, using latest "%s"' % latest_term)
            times = [latest_term]
        else:
            times = options.terms

        # validate terms
        for time in times:
            scraper.validate_term(time)
    else:
        # any other scraper_type would leave 'times' unbound below
        raise ScrapeError('unknown scraper type: %s' % scraper_type)

    # run scraper against each session/term
    for time in times:
        for chamber in options.chambers:
            scraper.scrape(chamber, time)
        # events are not always chamber-specific; cover 'other' when both
        # chambers were requested
        if scraper_type == 'events' and len(options.chambers) == 2:
            scraper.scrape('other', time)
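
Usage sketch (hypothetical values; the metadata dict is shaped the way the
terms/sessions lookups above expect, and the module path is an assumption):

from argparse import Namespace

metadata = {
    'terms': [{'name': '2009-2010', 'sessions': ['2009', '2010']}],
}
options = Namespace(output_dir='/tmp/scraped', no_cache=False, rpm=60,
                    timeout=30, strict=True, fastmode=False, alldata=False,
                    sessions=['2010'], terms=None,
                    chambers=['upper', 'lower'])

_run_scraper('scrapers.ex', 'ex', 'bills', options, metadata)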