def session_list():
    import lxml.html
    from scrapelib import urlopen
    from datetime import date
    import string

    # Start from City Clerk page
    city_clerk_url = 'http://sanjoseca.gov/index.aspx?NID=145'
    city_clerk_doc = lxml.html.fromstring(urlopen(city_clerk_url))
    city_clerk_doc.make_links_absolute(city_clerk_url)

    # Find current year
    current_year_url = city_clerk_doc.xpath(
        '//td[//span]//a[contains(text(),"Council Agendas 2")]/@href')[0]
    current_year_doc = lxml.html.fromstring(urlopen(current_year_url))
    current_year_doc.make_links_absolute(current_year_url)
    current_year_text = current_year_doc.xpath(
        '//tr[contains(@class, "telerik-reTableHeaderRow")]//td[contains(text(),"COUNCIL AGENDAS")]/text()')[0]
    current_year = string.split(current_year_text)[0]

    # Find agenda years
    council_agendas = map(
        string.strip,
        current_year_doc.xpath('//a[contains(text(),"Council Agendas 2")]/text()'))
    agenda_years = map(strip_council_agendas_prefix, council_agendas)

    # Find old archived years
    archives_url = current_year_doc.xpath(
        "//a[contains(text(),'Archived Agendas')]/@href")[0]
    archives_doc = lxml.html.fromstring(urlopen(archives_url))
    archives_doc.make_links_absolute(archives_url)

    archived_council_agendas = map(
        string.strip,
        archives_doc.xpath(
            '//table[./tr/td/div/strong[text()="Council Agendas/Synopses"]]//a/text()'))
    while archived_council_agendas.count('') > 0:
        archived_council_agendas.remove('')

    archived_council_minutes = map(
        string.strip,
        archives_doc.xpath(
            '//table[./tr/td/div/strong[text()="Council Meeting Minutes"]]//a/text()'))
    while archived_council_minutes.count('') > 0:
        archived_council_minutes.remove('')

    aggregated_years = ([current_year] + agenda_years +
                        archived_council_agendas + archived_council_minutes)
    unique_years = list(set(aggregated_years))
    int_years = map(int, unique_years)
    int_years.sort()
    session_years = map(str, int_years)
    return session_years
def s3_get(abbr, doc):
    if settings.AWS_BUCKET:
        k = boto.s3.key.Key(s3bucket)
        k.key = 'documents/{0}/{1}'.format(abbr, doc['doc_id'])

        # try and get the object, if it doesn't exist - pull it down
        try:
            return k.get_contents_as_string()
        except:
            data = scrapelib.urlopen(doc['url'].replace(' ', '%20'))
            content_type = data.response.headers.get('content-type')
            if not content_type:
                url = doc['url'].lower()
                if url.endswith('htm') or url.endswith('html'):
                    content_type = 'text/html'
                elif url.endswith('pdf'):
                    content_type = 'application/pdf'
            headers = {'x-amz-acl': 'public-read',
                       'Content-Type': content_type}
            k.set_contents_from_string(data.bytes, headers=headers)
            _log.debug('pushed %s to s3 as %s', doc['url'], doc['doc_id'])
            return data.bytes
    else:
        return scrapelib.urlopen(doc['url'].replace(' ', '%20')).bytes
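# Usage sketch for s3_get() above (not part of the original source): the
# function expects a document dict carrying 'doc_id' and 'url' keys, plus
# module-level settings, s3bucket, and _log objects. The jurisdiction code
# and document values below are hypothetical, for illustration only.
def _example_s3_get_usage():
    doc = {'doc_id': 'ABC00000000000001234',
           'url': 'http://example.com/bills/hb1.pdf'}
    return s3_get('ca', doc)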
def validate_api(abbr, schema_dir=None):
    metadata_schema = get_json_schema("metadata", schema_dir)
    path = "metadata/%s" % abbr
    url = api_url(path)
    json_response = scrapelib.urlopen(url)
    validictory.validate(json.loads(json_response), metadata_schema,
                         validator_cls=APIValidator)

    bill_schema = get_json_schema("bill", schema_dir)
    level = metadata(abbr)['level']
    spec = {'level': level, level: abbr}
    total_bills = db.bills.find(spec).count()
    for i in xrange(0, 100):
        bill = db.bills.find(spec)[random.randint(0, total_bills - 1)]
        path = "bills/%s/%s/%s/%s" % (abbr, bill['session'],
                                      bill['chamber'], bill['bill_id'])
        url = api_url(path)
        json_response = scrapelib.urlopen(url)
        validictory.validate(json.loads(json_response), bill_schema,
                             validator_cls=APIValidator)

    legislator_schema = get_json_schema("legislator", schema_dir)
    for legislator in db.legislators.find(spec):
        path = 'legislators/%s' % legislator['_id']
        url = api_url(path)
        json_response = scrapelib.urlopen(url)
        validictory.validate(json.loads(json_response), legislator_schema,
                             validator_cls=APIValidator)

    committee_schema = get_json_schema("committee", schema_dir)
    for committee in db.committees.find(spec):
        path = "committees/%s" % committee['_id']
        url = api_url(path)
        json_response = scrapelib.urlopen(url)
        validictory.validate(json.loads(json_response), committee_schema,
                             validator_cls=APIValidator)

    event_schema = get_json_schema("event", schema_dir)
    total_events = db.events.find(spec).count()
    if total_events:
        for i in xrange(0, 10):
            event = db.events.find(spec)[random.randint(0, total_events - 1)]
            path = "events/%s" % event['_id']
            url = api_url(path)
            json_response = scrapelib.urlopen(url)
            validictory.validate(json.loads(json_response), event_schema,
                                 validator_cls=APIValidator)
def session_list():
    import scrapelib
    import lxml.html

    # uses urllib because httplib2 has a compression issue on this page
    html = scrapelib.urlopen('http://legis.sd.gov/Legislative_Session/Menu.aspx')
    doc = lxml.html.fromstring(html)
    return doc.xpath(
        '//div[@id="ContentPlaceHolder1_BlueBoxLeft"]//ul/li/a/div/text()')
def scrape_committees_html(year, chamber, doc):
    name_dict = defaultdict(set)
    tds = doc.xpath('//td[@valign="top"]')[3:]
    cache = []
    for td in tds:
        for name_dict, c in _committees_td(td, chamber, url, name_dict):
            if c not in cache:
                cache.append(c)
                yield name_dict, c

    # Get the joint approps subcommittees during the upper scrape.
    if chamber == 'upper':
        url = committee_urls['joint'][year]
        html = scrapelib.urlopen(url)
        name_dict = defaultdict(set)
        doc = lxml.html.fromstring(html)
        tds = doc.xpath('//td[@valign="top"]')[3:]
        cache = []
        for td in tds:
            for name_dict, c in _committees_td(td, 'joint', url, name_dict):
                if c not in cache:
                    cache.append(c)
                    # These are subcommittees, so a quick switcheroo of the names:
                    c['subcommittee'] = c['committee']
                    c['committee'] = 'Appropriations'
                    yield name_dict, c
def main():
    conn = pymongo.Connection(settings.MONGO_HOST, settings.MONGO_PORT)
    tweets = conn['openstates_web']['tweets']
    data = urlopen('http://api.twitter.com/1/statuses/user_timeline.json?screen_name=openstates&count=1&trim_user=1')
    data = json.loads(data)
    tweets.drop()
    tweets.insert(data, safe=True)
def session_list():
    import scrapelib
    import lxml.html
    from billy.scrape.utils import url_xpath
    data = scrapelib.urlopen(
        'http://www.azleg.gov/xml/sessions.asp?sort=SessionID')
    doc = lxml.html.fromstring(data.bytes)
    return doc.xpath('//session/@session_full_name')
def session_list():
    import scrapelib
    text = scrapelib.urlopen('ftp://ftp.cga.ct.gov')
    sessions = [line.split()[-1] for line in text.splitlines()]
    sessions.remove('incoming')
    sessions.remove('pub')
    return sessions
def session_list():
    import scrapelib
    text = scrapelib.urlopen('ftp://ftp.cga.ct.gov')
    sessions = [line.split()[-1] for line in text.splitlines()]
    for not_session_name in ('incoming', 'pub', 'CGAAudio', 'rba', 'NCSL'):
        sessions.remove(not_session_name)
    return sessions
def session_list(): import scrapelib text = scrapelib.urlopen("ftp://ftp.cga.ct.gov") sessions = [line.split()[-1] for line in text.splitlines()] sessions.remove("incoming") sessions.remove("pub") return sessions
def session_list():
    import scrapelib
    import lxml.html
    url = 'http://www.legis.nd.gov/assembly/'
    sessions = []
    html = scrapelib.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    # go through links and look for pages that have an active Legislation: link
    for a in doc.xpath("//div[@class='linkblockassembly']/div/span/a"):
        ahtml = scrapelib.urlopen(a.get('href'))
        adoc = lxml.html.fromstring(ahtml)
        if adoc.xpath('//a[contains(@href, "leginfo")]'):
            sessions.append(a.text)
    return sessions
def session_list():
    import scrapelib
    import lxml.html
    url = 'http://www.legis.nd.gov/assembly/'
    sessions = []
    html = scrapelib.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    return doc.xpath("//div[@class='view-content']//a/text()")
def session_list():
    import lxml.html
    from scrapelib import urlopen
    from datetime import date

    url = 'http://www3.sanjoseca.gov/clerk/agenda.asp'
    doc = lxml.html.fromstring(urlopen(url))
    doc.make_links_absolute(url)
    timespan = next(
        text for text in doc.xpath(
            '//text()[contains(.,"Meeting")][contains(.,"Minutes")]/following::text()')
        if text.strip())
    start = int(timespan.split('-', 1)[0])
    return map(str, range(start, date.today().year))
def scrape_committees(year, chamber):
    '''Since the legislator names aren't properly capitalized in the csv
    file, scrape the committee page and use the names listed there instead.
    '''
    url = committee_urls[chamber][year]
    html = scrapelib.urlopen(url)
    name_dict = defaultdict(set)
    doc = lxml.html.fromstring(html)
    tds = doc.xpath('//td[@valign="top"]')[3:]
    cache = []
    for td in tds:
        for name_dict, c in _committees_td(td, chamber, url, name_dict):
            if c not in cache:
                cache.append(c)
                yield name_dict, c

    # Get the joint approps subcommittees during the upper scrape.
    if chamber == 'upper':
        url = committee_urls['joint'][year]
        html = scrapelib.urlopen(url)
        name_dict = defaultdict(set)
        doc = lxml.html.fromstring(html)
        tds = doc.xpath('//td[@valign="top"]')[3:]
        cache = []
        for td in tds:
            for name_dict, c in _committees_td(td, 'joint', url, name_dict):
                if c not in cache:
                    cache.append(c)
                    # These are subcommittees, so a quick switcheroo of the names:
                    c['subcommittee'] = c['committee']
                    c['committee'] = 'Appropriations'
                    yield name_dict, c
def scrape_story(url, cache_dir):
    story_doc = lxml.html.fromstring(scrapelib.urlopen(url))
    text = story_doc.xpath('//div[@class="storytext xcontrast_txt"]')[0].text_content()
    with open(cache_dir + '/' + hashlib.sha1(url).hexdigest(), 'w') as file:
        file.write(text.encode('utf8', 'ignore'))
    print url

    # check for next page
    base_url = 'http://www.fanfiction.net'
    next = story_doc.xpath('//input[contains(@value, "Next")]/@onclick')
    if next:
        url = base_url + next[0].replace("self.location='", '').strip("'")
        scrape_story(url, cache_dir)
def s3_get(id):
    k = boto.s3.key.Key(s3bucket)
    k.key = 'documents/{0}/{1}'.format(id[0:2].lower(), id)

    # try and get the object, if it doesn't exist - pull it down
    try:
        return k.get_contents_as_string()
    except:
        doc = db.tracked_versions.find_one(id)
        if not doc:
            return None
        data = scrapelib.urlopen(doc['url'].replace(' ', '%20'))
        content_type = data.response.headers['content-type']
        headers = {'x-amz-acl': 'public-read', 'Content-Type': content_type}
        k.set_contents_from_string(data.bytes, headers=headers)
        log.debug('pushed %s to s3 as %s', doc['url'], id)
        return data.bytes
def scrape_committees(year, chamber):
    '''Since the legislator names aren't properly capitalized in the csv
    file, scrape the committee page and use the names listed there instead.
    '''
    url = committee_urls[chamber][year]
    html = scrapelib.urlopen(url)
    name_dict = defaultdict(set)
    doc = lxml.html.fromstring(html)
    tds = doc.xpath('//td[@valign="top"]')[3:]
    cache = []
    for td in tds:
        for name_dict, c in _committees_td(td, chamber, url, name_dict):
            if c not in cache:
                cache.append(c)
                yield name_dict, c
def handle_noargs(self, **options):
    url = 'http://www.wwe.com/superstars'
    data = scrapelib.urlopen(url)
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(url)

    for div in doc.xpath('//div[starts-with(@class, "star ")]'):
        cssclass = div.get('class')
        if 'letter-champion' in cssclass:
            continue

        # get division
        divisions = ('divas', 'raw', 'smackdown')
        for division in divisions:
            if division in cssclass:
                break
        else:
            division = 'other'

        name = div.xpath('h2')[0].text_content().strip()
        url = div.xpath('a/@href')[0]
        id = url.rsplit('/', 1)[-1]
        photo_url = 'http://wwe.com' + div.xpath('a/img/@data-fullsrc')[0]

        if Star.objects.filter(id=id).count():
            star = Star.objects.get(id=id)
            if star.name != name:
                print('updating {0} name to {1}'.format(star.name, name))
                star.name = name
            if star.division != division:
                print('updating {0} division to {1}'.format(star.name, division))
                star.division = division
            if star.photo_url != photo_url:
                print('updating {0} photo to {1}'.format(star.name.encode('utf8'),
                                                         photo_url))
                star.photo_url = photo_url
            star.save()
        else:
            print('adding {0}'.format(name))
            Star.objects.create(id=id, name=name, division=division,
                                photo_url=photo_url)
def url_xpath(url, path):
    data = scrapelib.urlopen(url)
    if data is None:
        print "Could not get data from url: %s" % url
        raise NoData(url)
    doc = lxml.html.fromstring(data)
    if doc is None:
        print "Could not decode XML Doc: %s" % data
        raise NoDoc(data)
    result = doc.xpath(path)
    if result is None:
        print "Xpath failed"
        raise NoXpath(data)
    return result
def _get_url(url):
    return lxml.html.fromstring(scrapelib.urlopen(url))
def ct_session_info():
    html = scrapelib.urlopen("ftp://ftp.cga.ct.gov")
    sessions = [line.split()[-1] for line in html.splitlines()]
    sessions.pop()  # remove pub/
    return sessions, sessions[-1]
def lxmlize(self, url, encoding='utf-8'):
    entry = urlopen(url).encode(encoding)
    return lxml.html.fromstring(entry)
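# Usage sketch for lxmlize() above (not part of the original source): it takes
# `self`, so it presumably lives on a scraper class, but since `self` is never
# used it can be called standalone. The URL and XPath below are hypothetical.
def _example_lxmlize_usage():
    page = lxmlize(None, 'http://example.com/agenda.html')
    return page.xpath('//a/@href')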
def scrape_stories(url, cache_dir):
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    doc = lxml.html.fromstring(scrapelib.urlopen(url))
    doc.make_links_absolute(url)
    for link in doc.xpath('//a[@class="stitle"]/@href'):
        scrape_story(link, cache_dir)
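# Usage sketch (not part of the original source): scrape_stories() walks a
# fanfiction.net listing page and hands each story link to scrape_story(),
# which then follows "Next" pagination recursively. The listing URL and cache
# directory below are hypothetical.
def _example_scrape_stories_usage():
    listing_url = 'http://www.fanfiction.net/book/Example-Fandom/'
    scrape_stories(listing_url, 'story_cache')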
def url_xpath(url):
    html = scrapelib.urlopen(url)
    html = html.decode('latin-1')
    doc = lxml.html.fromstring(html)
    return doc
def url_xpath(url, path):
    import scrapelib
    import lxml.html
    doc = lxml.html.fromstring(scrapelib.urlopen(url))
    return doc.xpath(path)
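# Usage sketch (not part of the original source): url_xpath() pairs a fetch
# with an XPath query in one call, the same pattern the session_list()
# helpers above rely on. The URL and expression here are hypothetical.
def _example_url_xpath_usage():
    return url_xpath('http://example.com/sessions.html',
                     '//select[@name="session"]/option/text()')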
def url2lxml(url):
    html = urlopen(url).decode('latin-1')
    return lxml.html.fromstring(html)
def validate_xml(url, schema):
    response = scrapelib.urlopen(url + "&format=xml")
    xml = lxml.etree.fromstring(response)
    for child in xml.xpath("/results/*"):
        schema.assertValid(child)
def validate_api(state):
    cwd = os.path.split(__file__)[0]
    schema_dir = os.path.join(cwd, "../schemas/api/")
    xml_schema = get_xml_schema()

    with open(os.path.join(schema_dir, "metadata.json")) as f:
        metadata_schema = json.load(f)
    path = "metadata/%s" % state
    url = api_url(path)
    json_response = scrapelib.urlopen(url)
    validictory.validate(json.loads(json_response), metadata_schema,
                         validator_cls=APIValidator)
    validate_xml(url, xml_schema)

    with open(os.path.join(schema_dir, "bill.json")) as f:
        bill_schema = json.load(f)
    bill_spec = {'state': state}
    total_bills = db.bills.find(bill_spec).count()
    for i in xrange(0, 100):
        bill = db.bills.find(bill_spec)[random.randint(0, total_bills - 1)]
        path = "bills/%s/%s/%s/%s" % (state, bill['session'],
                                      bill['chamber'], bill['bill_id'])
        url = api_url(path)
        json_response = scrapelib.urlopen(url)
        validictory.validate(json.loads(json_response), bill_schema,
                             validator_cls=APIValidator)
        validate_xml(url, xml_schema)

    with open(os.path.join(schema_dir, "legislator.json")) as f:
        legislator_schema = json.load(f)
    for legislator in db.legislators.find({'state': state}):
        path = 'legislators/%s' % legislator['_id']
        url = api_url(path)
        json_response = scrapelib.urlopen(url)
        validictory.validate(json.loads(json_response), legislator_schema,
                             validator_cls=APIValidator)
        validate_xml(url, xml_schema)

    with open(os.path.join(schema_dir, "committee.json")) as f:
        committee_schema = json.load(f)
    for committee in db.committees.find({'state': state}):
        path = "committees/%s" % committee['_id']
        url = api_url(path)
        json_response = scrapelib.urlopen(url)
        validictory.validate(json.loads(json_response), committee_schema,
                             validator_cls=APIValidator)
        validate_xml(url, xml_schema)

    with open(os.path.join(schema_dir, "event.json")) as f:
        event_schema = json.load(f)
    total_events = db.events.find({'state': state}).count()
    if total_events:
        for i in xrange(0, 10):
            event = db.events.find({'state': state})[
                random.randint(0, total_events - 1)]
            path = "events/%s" % event['_id']
            url = api_url(path)
            json_response = scrapelib.urlopen(url)
            validictory.validate(json.loads(json_response), event_schema,
                                 validator_cls=APIValidator)
            validate_xml(url, xml_schema)
def url2lxml(url):
    html = urlopen(url)
    return lxml.html.fromstring(html)
def url_xpath(url):
    html = scrapelib.urlopen(url)
    doc = lxml.html.fromstring(html)
    return doc