コード例 #1
0
def scrape_facility(facility_url):
    """Scrape one facility page, geocode it when possible, and save it.

    The facility's name, location and type are read from the page of the
    first inspection linked under ``div#inspectionHistory`` rather than from
    the facility page itself, so a facility without inspections is skipped.

    Args:
        facility_url: URL of the facility page. It is also stored as the
            record's ``url`` and ``_id``.

    Returns:
        A ``(facility, facility_resp)`` tuple on success: ``facility`` is the
        dict handed to ``save_facility`` (with ``id`` set to that call's
        return value) and ``facility_resp`` is the response fetched for
        ``facility_url``. Returns ``None`` when the facility has no
        inspection links or when any step fails; the failure is logged and
        never propagated to the caller.
    """
    try:
        facility = {'url': facility_url, '_id': facility_url}
        facility_resp = requests.get(facility_url)
        # easier to get name, location, type from the inspection page
        # TODO: refactor to pull from facility page to avoid skipping
        # facilities with no inspections
        inspection_links = pq(facility_resp.content).find(
            'div#inspectionHistory a')
        if len(inspection_links) == 0:
            # Nothing to read the details from; say so explicitly instead of
            # failing with an IndexError on the lookup below.
            logger.warning("No inspections found for facility %s; skipping",
                           facility_url)
            return None
        inspection_url = THD_ROOT + '/' + pq(inspection_links[0]).attr('href')
        time.sleep(SECONDS_THROTTLE)
        inspection_resp = requests.get(inspection_url)
        doc = pq(inspection_resp.content)
        facility['name'] = doc.find('div#inspectionDetail h3').text()
        m = re.search('Location: (?P<location>.*)<br/>',
                      doc.find('div#inspectionDetail').html())
        if m is None:
            # Caught and logged by the handler at the bottom; this gives a
            # clearer traceback than "'NoneType' has no attribute 'group'".
            raise ValueError(
                "no 'Location:' line found on %s" % inspection_url)
        facility['location'] = m.group('location').strip()
        info = doc.find('div#inspectionInfo tr td')
        # The cells are walked two at a time (grouper is assumed to yield
        # consecutive pairs -- confirm against its definition); the second
        # cell of the second pair is stored as the facility type.
        for counter, pair in enumerate(grouper(info, 2)):
            if counter == 1:
                facility['type'] = pq(pair[1]).text()
                break
        if 'MAPQUEST_API_KEY' in os.environ:
            mq = geocoders.MapQuest(os.environ['MAPQUEST_API_KEY'])
            try:
                _place, (lat, lng) = mq.geocode(facility.get('location', ''))
                facility['latitude'] = lat
                facility['longitude'] = lng
            except Exception:
                # Geocoding is best-effort: the facility is still saved
                # without coordinates when the lookup fails.
                logger.exception("Could not geocode location '%s' for %s",
                                 facility.get('location', ''),
                                 facility.get('name', ''))
        print("facility: %s" % facility)
        facility['id'] = save_facility(facility)
        return facility, facility_resp
    except Exception:
        # Catch Exception rather than everything so that Ctrl-C and
        # SystemExit still stop the scraper.
        logger.exception("Could not scrape facility %s", facility_url)
        return None
コード例 #2
0
def scrape_inspections(startrow):
    """Scrape one page of search results and everything linked from it.

    Posts the search form with ``startrow`` set, then for every facility
    link in the results: scrapes the facility, scrapes each inspection
    linked from the facility page together with its violations, geocodes
    the facility when a MapQuest key is configured, and saves the facility.

    Args:
        startrow: Value sent as the ``startrow`` search parameter. Note that
            it is written into the module-level ``SEARCH_PARAMS`` dict.

    Returns:
        None. Any exception is logged together with ``startrow`` and
        swallowed, which ends processing of the current results page.
    """
    try:
        SEARCH_PARAMS.update({'startrow': startrow})
        search_resp = requests.post(THD_ROOT + '/index.cfm', data=SEARCH_PARAMS)
        facility_links = pq(search_resp.content).find(
            'div#searchResults a.resultMore')
        for f_link in facility_links:
            facility_url = THD_ROOT + '/' + pq(f_link).attr('href')
            time.sleep(SECONDS_THROTTLE)
            result = scrape_facility(facility_url)
            if result is None:
                # scrape_facility logs its own failures and returns None.
                # Unpacking that would raise TypeError and abandon every
                # remaining facility on this page, so skip just this one.
                continue
            facility, facility_resp = result

            inspection_links = pq(facility_resp.content).find(
                'div#inspectionHistory a')
            for i_link in inspection_links:
                inspection_url = THD_ROOT + '/' + pq(i_link).attr('href')
                inspection, inspection_resp = scrape_inspection(inspection_url,
                                                                facility)

                scrape_violations(inspection_resp.content, inspection)
                time.sleep(SECONDS_THROTTLE)

            # Only geocode when coordinates are still missing, so a facility
            # that already carries them does not cost a second API call.
            needs_geocode = ('latitude' not in facility or
                             'longitude' not in facility)
            if needs_geocode and 'MAPQUEST_API_KEY' in os.environ:
                mq = geocoders.MapQuest(os.environ['MAPQUEST_API_KEY'])
                try:
                    _place, (lat, lng) = mq.geocode(
                        facility.get('location', ''))
                    facility['latitude'] = lat
                    facility['longitude'] = lng
                except Exception:
                    # Best-effort: the facility is saved without coordinates.
                    logger.exception("Could not geocode location '%s' for %s",
                                     facility.get('location', ''),
                                     facility.get('name', ''))

            print("facility: %s" % facility)
            facility['id'] = save_facility(facility)
    except Exception:
        # Catch Exception rather than everything so that Ctrl-C and
        # SystemExit still stop the scraper.
        logger.exception("Could not scrape at startrow: %s", startrow)