def scrape_facility(facility_url):
    """Scrape a single facility, geocode it when possible, and save it.

    Fetches ``facility_url``, follows the first link found under
    ``div#inspectionHistory`` and reads the facility's name, location and
    type from that inspection page.  When ``MAPQUEST_API_KEY`` is present in
    the environment, the location is geocoded into ``latitude`` and
    ``longitude``.  The record is then handed to ``save_facility`` and the
    value it returns is stored under ``facility['id']``.

    Args:
        facility_url: URL of the facility page to fetch.

    Returns:
        A ``(facility, facility_resp)`` tuple -- the facility dict and the
        response object for the facility page -- or ``None`` when scraping
        failed.  Failures are logged with a traceback rather than raised,
        so callers must check for ``None`` before unpacking.
    """
    facility = {'url': facility_url, '_id': facility_url}
    try:
        facility_resp = requests.get(facility_url)

        # easier to get name, location, type from the inspection page
        # TODO: refactor to pull from facility page to avoid skipping
        # facilities with no inspections
        inspection_links = pq(facility_resp.content).find(
            'div#inspectionHistory a')
        # A facility without inspections raises IndexError here; it is
        # handled below like any other scrape failure.
        inspection_url = THD_ROOT + '/' + pq(inspection_links[0]).attr('href')

        time.sleep(SECONDS_THROTTLE)
        inspection_resp = requests.get(inspection_url)
        doc = pq(inspection_resp.content)

        facility['name'] = doc.find('div#inspectionDetail h3').text()
        m = re.search(r'Location: (?P<location>.*)<br/>',
                      doc.find('div#inspectionDetail').html())
        facility['location'] = m.group('location').strip()

        # The info cells are walked two at a time; the second cell of the
        # second pair is stored as the facility type.
        info = doc.find('div#inspectionInfo tr td')
        for counter, pair in enumerate(grouper(info, 2)):
            value = pq(pair[1]).text()
            if counter == 1:
                facility['type'] = value

        if 'MAPQUEST_API_KEY' in os.environ:
            mq = geocoders.MapQuest(os.environ['MAPQUEST_API_KEY'])
            try:
                _place, (lat, lng) = mq.geocode(facility.get('location', ''))
                facility['latitude'] = lat
                facility['longitude'] = lng
            except Exception:
                # Geocoding is best effort: keep the facility without
                # coordinates instead of dropping it.
                logger.exception("Could not geocode location '%s' for %s",
                                 facility.get('location', ''),
                                 facility.get('name', ''))

        print("facility: %s" % facility)
        facility['id'] = save_facility(facility)
        return facility, facility_resp
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit can
        # still stop a long-running scrape.
        logger.exception("Could not scrape facility %s", facility_url)
        return None
def scrape_inspections(startrow):
    """Scrape every facility listed on one page of search results.

    Posts the search form with ``startrow`` merged into ``SEARCH_PARAMS``,
    then for each ``a.resultMore`` link in the results scrapes the facility,
    each inspection linked from its inspection history, and the violations
    on each inspection page.  Requests are spaced by ``SECONDS_THROTTLE``.

    Args:
        startrow: Value sent as the ``startrow`` search parameter.

    Returns:
        None.  Any failure while processing the page is logged with a
        traceback rather than raised.
    """
    try:
        SEARCH_PARAMS.update({'startrow': startrow})
        search_resp = requests.post(THD_ROOT + '/index.cfm',
                                    data=SEARCH_PARAMS)
        facility_links = pq(search_resp.content).find(
            'div#searchResults a.resultMore')

        for f_link in facility_links:
            facility_url = THD_ROOT + '/' + pq(f_link).attr('href')
            time.sleep(SECONDS_THROTTLE)

            scraped = scrape_facility(facility_url)
            if scraped is None:
                # scrape_facility logs its own failure and returns None.
                # Unpacking that would raise TypeError and abandon every
                # remaining facility on this page, so skip just this one.
                continue
            facility, facility_resp = scraped

            inspection_links = pq(facility_resp.content).find(
                'div#inspectionHistory a')
            for i_link in inspection_links:
                inspection_url = THD_ROOT + '/' + pq(i_link).attr('href')
                inspection, inspection_resp = scrape_inspection(
                    inspection_url, facility)
                scrape_violations(inspection_resp.content, inspection)
                time.sleep(SECONDS_THROTTLE)

            # NOTE(review): scrape_facility already geocodes and saves the
            # facility; this second pass is kept as-is but looks redundant
            # unless scrape_inspection changes the facility -- confirm.
            # It is assumed to run once per facility, after its inspections.
            if 'MAPQUEST_API_KEY' in os.environ:
                mq = geocoders.MapQuest(os.environ['MAPQUEST_API_KEY'])
                try:
                    _place, (lat, lng) = mq.geocode(
                        facility.get('location', ''))
                    facility['latitude'] = lat
                    facility['longitude'] = lng
                except Exception:
                    # Best effort: a geocoding failure must not stop the
                    # facility from being saved.
                    logger.exception(
                        "Could not geocode location '%s' for %s",
                        facility.get('location', ''),
                        facility.get('name', ''))

            print("facility: %s" % facility)
            facility['id'] = save_facility(facility)
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit can
        # still stop a long-running scrape.
        logger.exception("Could not scrape at startrow: %s", startrow)