def newfunc(
    header: typing.List[str],
    retries: int,
    retry_wait: int,
    rpm: int,
    timeout: int,
    user_agent: str,
    verbosity: int,
    verify: bool,
    fastmode: bool,
    **kwargs: str,
) -> None:
    """Build a configured ``Scraper``, set up logging, and call ``func``.

    Args:
        header: Extra HTTP headers, each given as a ``"Name: value"`` string.
        retries: Passed to ``Scraper`` as ``retry_attempts``.
        retry_wait: Passed to ``Scraper`` as ``retry_wait_seconds``.
        rpm: Passed to ``Scraper`` as ``requests_per_minute``.
        timeout: Assigned to ``scraper.timeout``.
        user_agent: Assigned to ``scraper.user_agent``.
        verbosity: Selects the root log level (-1 is the "not specified"
            value; 0 = ERROR, 1 = INFO, 2 or more = DEBUG).
        verify: Passed to ``Scraper`` as ``verify``.
        fastmode: When true, attach a ``SQLiteCache("spatula-cache.db")``
            to the scraper and clear ``cache_write_only``.
        **kwargs: Forwarded unchanged to ``func``.

    Returns:
        Whatever ``func(**kwargs, scraper=scraper)`` returns.

    Raises:
        ValueError: If an entry in ``header`` contains no ``":"`` separator.
    """
    scraper = Scraper(
        requests_per_minute=rpm,
        retry_attempts=retries,
        retry_wait_seconds=retry_wait,
        verify=verify,
    )
    scraper.timeout = timeout
    scraper.user_agent = user_agent

    # only update headers, don't overwrite defaults
    extra_headers = {}
    for raw_header in header:
        # Split on the FIRST colon only: header values routinely contain
        # colons themselves (URLs, host:port pairs, timestamps).
        name, separator, value = raw_header.partition(":")
        if not separator:
            raise ValueError(
                "invalid header {!r}: expected 'Name: value'".format(raw_header)
            )
        extra_headers[name.strip()] = value.strip()
    scraper.headers.update(extra_headers)

    if fastmode:
        scraper.cache_storage = SQLiteCache("spatula-cache.db")
        scraper.cache_write_only = False

    # NOTE(review): no branch handles verbosity < -1, so `level` would be
    # unbound in that case; callers are assumed to pass -1 or greater.
    if verbosity == -1:
        level = logging.INFO if func.__name__ != "test" else logging.DEBUG
    elif verbosity == 0:  # pragma: no cover
        level = logging.ERROR
    elif verbosity == 1:  # pragma: no cover
        level = logging.INFO
    elif verbosity >= 2:  # pragma: no cover
        level = logging.DEBUG

    if verbosity < 3:
        # replace parent library logging
        logging.getLogger("scrapelib").setLevel(logging.ERROR)
        logging.getLogger("urllib3").setLevel(logging.WARNING)
    logging.basicConfig(level=level)

    return func(**kwargs, scraper=scraper)
# NOTE(review): the next three lines are the tail of a list literal whose
# opening bracket lies outside this excerpt.
('affordable care act', 'obamacare', 'healthcare', 'health care', 'insurance'),
('ukraine', 'ukrainian', 'crimea'),
('unemployed', 'unemployment')]
# Commented-out tuples, apparently entries that were dropped from the list
# above -- TODO confirm whether they can be deleted.
#('palestine', 'palestinians'),
#('israel', 'israeli', 'palestine', 'palestinians'),
#('iraq', 'iraqis', 'iraqs'),
#('executive order', 'executive action'),
#('economy', 'economic'),

# Base URL of the paginated press-briefing index; pages are requested as
# ROOT_URL + '?page=<n>' in scrape_briefings().
ROOT_URL = 'http://www.whitehouse.gov/briefing-room/press-briefings'
# CSV file name; not referenced in the visible part of this file.
CSV_PATH = 'briefing_links.csv'

# Module-level scraper, capped at 60 requests per minute, with an on-disk
# cache in the 'press_briefing_cache' directory.
s = Scraper(requests_per_minute=60)
s.cache_storage = FileCache('press_briefing_cache')
# Per scrapelib, clearing cache_write_only lets cached responses be read
# back, not just written.
s.cache_write_only = False


@task(default=True)
def update():
    """
    Stub function for updating app-specific data.

    Registered with ``@task(default=True)``. The body is currently a no-op
    because its only call is commented out.
    """
    #update_featured_social()


@task
def scrape_briefings():
    # NOTE(review): only the start of this function is visible here; the
    # visible part builds each index-page URL and prints it.
    # Pages 0 through 21 of the briefing index.
    for index in range(0, 22):
        # NOTE(review): `list` shadows the builtin of the same name.
        list = '%s?page=%i' % (ROOT_URL, index)
        print 'parsing %s' % list
#!/usr/bin/env python import unicodecsv import statestyle import re from lxml.html import fromstring from urlparse import urljoin # I copied this code from scrape_states.py from scrapelib import Scraper, FileCache s = Scraper(requests_per_minute=60, follow_robots=False) s.cache_storage = FileCache('wikipedia_cache') s.cache_write_only = False # My Stuff CD_LIST = 'https://en.wikipedia.org/wiki/List_of_United_States_congressional_districts' NON_VOTING = ['American Samoa', 'District of Columbia', 'Guam', 'Northern Mariana Islands', 'Puerto Rico', 'United States Virgin Islands'] NOT_STATES = ['Philippines', 'U.S. Virgin Islands'] def parse_cd_file(): writer = unicodecsv.writer(open('cd_wiki_data.csv', 'w')) writer.writerow(['full_geoid', 'wiki_url']) response = s.urlopen(CD_LIST) doc = fromstring(response) for h2 in doc.findall('.//h2')[2:59]: for span in h2.find_class('mw-headline'): if span.text_content() in NOT_STATES: break