def get_session_details():
    """
    We will fetch a list of available sessions from the 'bill locator'
    page. We won't get legislators for all these sessions, but all bills
    for these sessions are available and we want to be able to get to
    them.
    """
    scraper = Scraper()
    nm_locator_url = 'http://legis.state.nm.us/lcs/locator.aspx'
    with scraper.urlopen(nm_locator_url) as page:
        page = BeautifulSoup(page)

        # The first `tr` is simply 'Bill Locator'. Ignore that row.
        data_table = page.find('table', id='ctl00_mainCopy_Locators')('tr')[1:]
        for session in data_table:
            session_tag = session.find('a')
            session_name = ' '.join(
                [tag.string.strip() for tag in session_tag('span')]).strip()
            session_year, sub_session_name = \
                SESSION_NAME_RE.match(session_name).groups()
            if session_year in metadata['sessions']:
                sub_sessions = \
                    metadata['session_details'][session_year]['sub_sessions']
                if sub_session_name not in sub_sessions:
                    sub_sessions.append(sub_session_name)
            else:
                metadata['sessions'].append(session_year)
                metadata['session_details'][session_year] = dict(
                    years=session_year, sub_sessions=[sub_session_name])
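
# SESSION_NAME_RE and `metadata` are module-level objects defined elsewhere
# in the scraper. A minimal sketch of what the pattern might look like,
# assuming session names such as '2011 Regular' (the real regex is not
# shown here; this stand-in is hypothetical):
import re

SESSION_NAME_RE = re.compile(r'^(\d{4})\s*(.*)$')  # hypothetical stand-in

year, sub_session = SESSION_NAME_RE.match('2011 Regular').groups()
assert (year, sub_session) == ('2011', 'Regular')
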
def lxmlize(url, encoding="utf-8", user_agent=requests.utils.default_user_agent()): scraper = Scrapelib(follow_robots=False, requests_per_minute=0) scraper.user_agent = user_agent entry = scraper.urlopen(url) if encoding != "utf-8" or not isinstance(entry, unicode): entry = entry.encode(encoding) page = lxml.html.fromstring(entry) meta = page.xpath('//meta[@http-equiv="refresh"]') if meta: _, url = meta[0].attrib["content"].split("=", 1) return lxmlize(url, encoding) else: page.make_links_absolute(url) return page
def lxmlize(url, encoding='utf-8',
            user_agent=requests.utils.default_user_agent()):
    scraper = Scraper(follow_robots=False, requests_per_minute=0)
    scraper.user_agent = user_agent
    entry = scraper.urlopen(url)
    if encoding != 'utf-8' or not isinstance(entry, unicode):
        entry = entry.encode(encoding)
    page = lxml.html.fromstring(entry)
    meta = page.xpath('//meta[@http-equiv="refresh"]')
    if meta:
        # Follow <meta http-equiv="refresh"> client-side redirects.
        _, url = meta[0].attrib['content'].split('=', 1)
        return lxmlize(url, encoding)
    else:
        page.make_links_absolute(url)
        return page
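
# Usage sketch (placeholder URL): lxmlize returns an lxml tree whose links
# have already been made absolute, so hrefs can be followed without joining.
doc = lxmlize('http://example.com/committees')
for href in doc.xpath('//a/@href'):
    print href  # absolute, thanks to make_links_absolute()
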
def get_response(
    self, scraper: scrapelib.Scraper
) -> Optional[requests.models.Response]:
    return scraper.request(method=self.method, url=self.url,
                           data=self.data, headers=self.headers)
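
# As the tests further below show, this method lives on a source object
# (e.g. spatula's URL) and is handed a scrapelib Scraper. A usage sketch,
# assuming URL is importable at package level:
from scrapelib import Scraper
from spatula import URL  # assumption: package-level export

resp = URL("https://httpbin.org/get").get_response(Scraper())
print(resp.status_code)  # 200 on success
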
def import_bills(state, last_updated, cache_dir, data_dir):
    if last_updated:
        scraper = Scraper(cache_dir=cache_dir)
        url = BILL_INDEX + "?%s"
        query = {'state': state,
                 'updated_since': last_updated,  # YYYY-MM-DD
                 'apikey': settings.SUNLIGHT_API_KEY}
        query = urllib.urlencode(query)
        url = url % query
        with scraper.urlopen(url) as bill_index:
            bills = json.loads(bill_index)
        for b in bills:
            url = BILL_INDEX + "%s/%s/%s/?apikey=%s" % (
                b['state'], b['session'], urllib.quote(b['bill_id']),
                settings.SUNLIGHT_API_KEY)
            with scraper.urlopen(url) as bill_page:
                bill = json.loads(bill_page)
            process_bill(bill)
    else:
        pattern = os.path.join(data_dir, state, 'bills', state)
        sessions = Session.objects.values_list('name')
        _request_frequency = 1
        _last_request = 0
        for session in sessions:
            for chamber in ('upper', 'lower'):
                paths = glob.glob(os.path.join(pattern, session[0],
                                               chamber, '*'))
                for path in sorted(paths):
                    now = time.time()
                    diff = _request_frequency - (now - _last_request)
                    if diff > 0:
                        print "sleeping for %fs" % diff
                        time.sleep(diff)
                        _last_request = time.time()
                    else:
                        _last_request = now
                    page = open(path, 'rb')
                    bill = json.load(page)
                    page.close()
                    process_bill(bill)
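
# For reference, urllib.urlencode (Python 2, as above) turns the query dict
# into the query string appended to BILL_INDEX; the values here are made up:
import urllib

qs = urllib.urlencode({'state': 'nm',
                       'updated_since': '2012-01-01',
                       'apikey': 'XXXX'})
# e.g. 'state=nm&updated_since=2012-01-01&apikey=XXXX' (key order may vary)
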
def test_to_items_scout():
    scraper = Scraper()
    page = FirstPage()
    items = list(page._to_items(scraper, scout=True))
    assert len(items) == 3
    assert items[0] == {
        "data": {"first": 1},
        "__next__": "SecondPage source=NullSource",
    }
    assert items[1] == {
        "data": {"first": 2},
        "__next__": "SecondPage source=NullSource",
    }
    assert items[2] == {
        "data": {"first": 3},
        "__next__": "SecondPage source=NullSource",
    }
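
# FirstPage/SecondPage are test fixtures defined elsewhere. A plausible
# reconstruction consistent with the expected items above, assuming
# spatula's Page and NullSource API (hypothetical, not the actual fixture):
from spatula import Page, NullSource

class SecondPage(Page):
    source = NullSource()

class FirstPage(Page):
    source = NullSource()

    def process_page(self):
        # yielding further Page instances is what scout mode records
        for n in (1, 2, 3):
            yield SecondPage({"first": n})
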
def shell(url: str, verb: str, scraper: Scraper) -> None:
    """
    Start a session to interact with a particular page.
    """
    try:
        from IPython import embed  # type: ignore
    except ImportError:  # pragma: no cover
        click.secho("shell command requires IPython", fg="red")
        return
    # import selectors so they can be used without import
    from .selectors import SelectorError, XPath, SimilarLink, CSS  # noqa

    resp = scraper.request(verb, url)
    root = lxml.html.fromstring(resp.content)  # noqa
    click.secho(f"spatula {VERSION} shell", fg="blue")
    click.secho("available selectors: CSS, SimilarLink, XPath", fg="blue")
    click.secho("local variables", fg="green")
    click.secho("---------------", fg="green")
    click.secho("url: %s" % url, fg="green")
    click.secho("resp: requests Response instance", fg="green")
    click.secho(f"root: `lxml HTML element` <{root.tag}>", fg="green")
    embed()
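
# This function is normally wired to a CLI command; called directly it
# drops into an IPython session for the fetched page (placeholder URL,
# "GET" being the usual verb):
from scrapelib import Scraper

shell("https://example.com", "GET", Scraper())
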
def newfunc(
    header: typing.List[str],
    retries: int,
    retry_wait: int,
    rpm: int,
    timeout: int,
    user_agent: str,
    verbosity: int,
    verify: bool,
    fastmode: bool,
    **kwargs: str,
) -> None:
    scraper = Scraper(
        requests_per_minute=rpm,
        retry_attempts=retries,
        retry_wait_seconds=retry_wait,
        verify=verify,
    )
    scraper.timeout = timeout
    scraper.user_agent = user_agent
    # only update headers, don't overwrite defaults
    scraper.headers.update(
        {k.strip(): v.strip() for k, v in [h.split(":") for h in header]})
    if fastmode:
        scraper.cache_storage = SQLiteCache("spatula-cache.db")
        scraper.cache_write_only = False

    if verbosity == -1:
        level = logging.INFO if func.__name__ != "test" else logging.DEBUG
    elif verbosity == 0:  # pragma: no cover
        level = logging.ERROR
    elif verbosity == 1:  # pragma: no cover
        level = logging.INFO
    elif verbosity >= 2:  # pragma: no cover
        level = logging.DEBUG
    if verbosity < 3:
        # replace parent library logging
        logging.getLogger("scrapelib").setLevel(logging.ERROR)
        logging.getLogger("urllib3").setLevel(logging.WARNING)
    logging.basicConfig(level=level)
    return func(**kwargs, scraper=scraper)
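
# The header option takes strings like "Name: value"; the comprehension
# above splits each on ":" and trims whitespace (sample values made up):
header = ["X-Api-Key: abc123", "Accept : application/json"]
parsed = {k.strip(): v.strip() for k, v in [h.split(":") for h in header]}
assert parsed == {"X-Api-Key": "abc123", "Accept": "application/json"}
# note: a value containing ":" would need h.split(":", 1) to unpack cleanly
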
SYNONYMS = [
    ('isis', 'isil', 'islamic state'),
    ('veteran', 'veterans', 'shinseki', 'va'),
    ('affordable care act', 'obamacare', 'healthcare', 'health care',
     'insurance'),
    ('ukraine', 'ukrainian', 'crimea'),
    ('unemployed', 'unemployment'),
]
# ('palestine', 'palestinians'),
# ('israel', 'israeli', 'palestine', 'palestinians'),
# ('iraq', 'iraqis', 'iraqs'),
# ('executive order', 'executive action'),
# ('economy', 'economic'),

ROOT_URL = 'http://www.whitehouse.gov/briefing-room/press-briefings'
CSV_PATH = 'briefing_links.csv'

s = Scraper(requests_per_minute=60)
s.cache_storage = FileCache('press_briefing_cache')
s.cache_write_only = False


@task(default=True)
def update():
    """
    Stub function for updating app-specific data.
    """
    # update_featured_social()


@task
def scrape_briefings():
    for index in range(0, 22):
        list = '%s?page=%i' % (ROOT_URL, index)
        print 'parsing %s' % list
# reading in what was output on the last run can make things more efficient.
import unicodecsv
from lxml.html import fromstring
from urlparse import urljoin
from urllib import quote_plus
import re
import os
import os.path

from scrapelib import Scraper, FileCache, HTTPError

OUTPUT_FILE = "zctas.csv"
BASE_URL = 'https://en.wikipedia.org/wiki/'

# would like to follow robots, but think that the robots parser is broken...
s = Scraper(requests_per_minute=90, follow_robots=False)
s.cache_storage = FileCache('../wikipedia_cache')
s.cache_write_only = False


def test_zips():
    existing = {}
    if os.path.exists(OUTPUT_FILE):
        for row in unicodecsv.DictReader(open(OUTPUT_FILE)):
            existing[row['zip']] = row
        os.rename(OUTPUT_FILE, OUTPUT_FILE + '.bak')
    r = unicodecsv.DictReader(open("2013_Gaz_zcta_national.txt"),
                              delimiter="\t")
    f = r.fieldnames
    writer = unicodecsv.DictWriter(open(OUTPUT_FILE, "w"),
                                   ['zip', 'wiki_url'])
    # write the header row by hand (equivalent to writer.writeheader())
    writer.writerow({'zip': 'zip', 'wiki_url': 'wiki_url'})
def test_source_no_timeout():
    source = URL("https://httpbin.org/delay/1")
    assert source.get_response(Scraper()).status_code == 200
def test_source_timeout():
    source = URL("https://httpbin.org/delay/1", timeout=0.1)
    with pytest.raises(OSError):
        source.get_response(Scraper())
import codecs
import csv
import urlparse

from scrapelib import Scraper, FileCache, HTTPError

s = Scraper(requests_per_minute=60)
s.cache_storage = FileCache('walmart_cache')
s.cache_write_only = False


def read_csv():
    with open('scrapedsearch.csv', 'rb') as f:
        reader = csv.DictReader(
            f, fieldnames=["title", "link", "date", "description"])
        for row in reader:
            print row
            # skip the header row, which repeats the field names
            if row['link'] != 'link':
                scrape_release(row)


def scrape_release(row):
    path = urlparse.urlparse(row['link'])[2]
    components = path.split('/')
    if len(components) > 4:
        year = components[-4]
        month = components[-3]
        day = components[-2]
        slug = components[-1]
        filename = '%s-%s-%s-%s' % (year, month, day, slug)
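
# Worked example of the path handling in scrape_release (URL is made up):
import urlparse  # Python 2, matching the snippet above

path = urlparse.urlparse('http://example.com/news/2014/05/12/some-slug')[2]
components = path.split('/')
# ['', 'news', '2014', '05', '12', 'some-slug']
year, month, day, slug = components[-4:]
assert '%s-%s-%s-%s' % (year, month, day, slug) == '2014-05-12-some-slug'
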
#!/usr/bin/env python
import unicodecsv
import statestyle
import re
from lxml.html import fromstring
from urlparse import urljoin

# I copied this code from scrape_states.py
from scrapelib import Scraper, FileCache

s = Scraper(requests_per_minute=60, follow_robots=False)
s.cache_storage = FileCache('wikipedia_cache')
s.cache_write_only = False

# My Stuff
CD_LIST = 'https://en.wikipedia.org/wiki/List_of_United_States_congressional_districts'

NON_VOTING = ['American Samoa', 'District of Columbia', 'Guam',
              'Northern Mariana Islands', 'Puerto Rico',
              'United States Virgin Islands']
NOT_STATES = ['Philippines', 'U.S. Virgin Islands']


def parse_cd_file():
    writer = unicodecsv.writer(open('cd_wiki_data.csv', 'w'))
    writer.writerow(['full_geoid', 'wiki_url'])
    response = s.urlopen(CD_LIST)
    doc = fromstring(response)
    for h2 in doc.findall('.//h2')[2:59]:
        for span in h2.find_class('mw-headline'):
            if span.text_content() in NOT_STATES:
                break