コード例 #1
0
def get_cities():
    """Collect the cities listed under every locality on the site.

    The locality index page is fetched first.  Each distinct locality link is
    then requested once to read its display name (the first ``<h3>`` of the
    locality's main page) and once more to read its list of city links.

    Returns:
        list of dict: one entry per city link, with the keys ``name``,
        ``locality``, ``baseUrl`` and ``establishmentUrl``.
    """
    localities = []
    cities = []

    locality_list = scrapertools.get_content(BASE_URL + LOCALITY_LIST_URL)
    locality_a_tags = locality_list.body.div.img.find_all_next('a')
    # Each href loses its first character before being appended to BASE_URL
    # (presumably a leading '/' that BASE_URL already ends with -- TODO
    # confirm).  The set removes duplicate links; sorting makes the crawl
    # order, and so the order of the returned list, reproducible.
    locality_urls = sorted(set(tag['href'][1:] for tag in locality_a_tags))

    for locality_url in locality_urls:
        locality_main = scrapertools.get_content(BASE_URL + locality_url + LOCALITY_MAIN_URL)
        locality_name = locality_main.find('h3').string
        localities.append({
            'name': locality_name,
            'url': locality_url
        })
        # The parenthesised single-argument form behaves the same on
        # Python 2 and Python 3, unlike the bare print statement.
        print('Locality: ' + locality_name)

    for locality in localities:
        city_list = scrapertools.get_content(BASE_URL + locality['url'] + CITY_LIST_URL)

        for city in city_list.find_all('a'):
            href = city['href']
            name = str(city.string).strip()
            print('Loading {0} ({1})'.format(name, locality['name']))
            cities.append({
                'name': name,
                'locality': locality['name'],
                # NOTE(review): str.find returns -1 when the marker is absent,
                # in which case this slice silently drops the last character
                # of the href -- confirm every city link contains the marker.
                'baseUrl': href[:href.find('Food-List-ByName')],
                'establishmentUrl': href.replace('Count=30', '')
            })

    return cities
コード例 #2
0
def get_establishments(city):
    """Page through a city's establishment listing and collect every entry.

    Pages of up to 1000 rows are requested, starting at offset 1, until a
    page containing the text ``No documents found`` is returned.

    Args:
        city: dict providing ``establishmentUrl``, ``locality`` and
            ``baseUrl`` (the shape produced by ``get_cities``).

    Returns:
        list of dict: one entry per table row that has exactly four cells and
        a link in the first cell, with the keys ``slug``, ``name``, ``url``,
        ``address``, ``locality``, ``last_inspection_date``, ``baseUrl`` and
        ``inserted``.
    """
    establishments_found = []

    start = 1
    count = 1000

    # NOTE(review): the loop only ends when the "No documents found" marker
    # appears; a page without it (e.g. an error page) would be re-paged
    # forever -- confirm the server always emits the marker.
    while True:
        establishment_list = scrapertools.get_content(
            BASE_URL + city['establishmentUrl'] + '&start=' + str(start) + '&count=' + str(count))
        if establishment_list.find(text='No documents found') is not None:
            break
        start += count

        for establishment in establishment_list.find_all('tr'):
            details = establishment.find_all('td')
            # Only four-cell rows whose first cell holds a link are entries.
            if len(details) != 4 or details[0].a is None:
                continue

            raw_date = scrapertools.get_text(details[3])
            date = None if raw_date is None else datetime.strptime(raw_date, '%d-%b-%Y')

            # Remove establishment IDs (e.g. "#12-3456" or "(12-3456)") and
            # collapse newlines in the establishment name.  Raw strings keep
            # the regex escapes from being treated as string escapes.
            name = re.sub(r'(#|\()(\s)*([0-9][0-9]-[0-9][0-9][0-9][0-9])(\))?', '',
                          scrapertools.get_text(details[0]))
            name = re.sub(r'(\n)+', ' ', name).strip()
            address = scrapertools.get_text(details[2])

            establishments_found.append({
                'slug': slugify(name + ' ' + address),
                'name': name,
                'url': details[0].a['href'],
                'address': address,
                'locality': city['locality'],
                'last_inspection_date': date,
                'baseUrl': city['baseUrl'],
                'inserted': datetime.now()
            })
    return establishments_found
コード例 #3
0
def get_establishments(city):
    """Walk the paginated establishment listing of ``city``.

    Requests pages of 1000 rows, beginning at offset 1, and stops at the
    first page that contains the text ``No documents found``.

    Args:
        city: dict providing ``establishmentUrl`` and ``locality``.

    Returns:
        list of dict: one entry per four-cell table row whose first cell
        holds a link, with the keys ``name``, ``url``, ``address``,
        ``locality`` and ``last_inspection_date``.
    """
    page_size = 1000
    offset = 1
    results = []

    while True:
        page = scrapertools.get_content(
            BASE_URL + city['establishmentUrl'] + '&start=' + str(offset) + '&count=' + str(page_size))
        if page.find(text='No documents found') is not None:
            break
        offset += page_size

        for row in page.find_all('tr'):
            cells = row.find_all('td')
            # Skip anything that is not a four-cell row with a linked name.
            if len(cells) != 4 or cells[0] is None or cells[0].a is None:
                continue

            if scrapertools.get_text(cells[3]) is None:
                inspected_on = None
            else:
                inspected_on = datetime.strptime(scrapertools.get_text(cells[3]), '%d-%b-%Y')

            entry = {
                'name': scrapertools.get_text(cells[0]),
                'url': cells[0].a['href'],
                'address': scrapertools.get_text(cells[2]),
                'locality': city['locality'],
                'last_inspection_date': inspected_on
            }
            results.append(entry)

    return results
コード例 #4
0
def get_establishment_details(establishment):
    """Fill in the ``city`` and ``type`` keys of ``establishment``.

    The establishment's detail page is fetched from its ``url`` key.  The
    ``city`` value is the string form of the first ``<br>`` found in the node
    following the "Facility Location" label's parent, with ``<br>``/``</br>``
    markup, carriage returns and newlines removed.  The ``type`` value is the
    ``.string`` of the node following the "Facility Type" label's parent.

    Args:
        establishment: dict with a ``url`` key; it is modified in place.

    Returns:
        The same ``establishment`` dict.
    """
    page = scrapertools.get_content(BASE_URL + establishment['url'])

    location_label = page.find(text=re.compile('Facility Location'))
    first_break = location_label.parent.next_sibling.find('br')
    establishment['city'] = re.sub('(<(/)?br>)|(\r)|(\n)', '', str(first_break))

    type_label = page.find(text=re.compile("Facility Type"))
    establishment['type'] = type_label.parent.next_sibling.string

    return establishment
コード例 #5
0
def get_cities():
    """Collect every distinct city name reachable from the locality index.

    Each link that follows the index page's first image is treated as a
    locality; its city list is fetched and every city link whose name has not
    been seen before is recorded.  A city name that appears under several
    localities is kept only for the first locality in which it is found.

    Returns:
        list of dict: one entry per distinct city name, with the keys
        ``name``, ``locality``, ``baseUrl`` and ``establishmentUrl``.
    """
    cities_found = []
    # A set gives constant-time duplicate checks; the original list made the
    # whole crawl quadratic in the number of cities.
    seen_names = set()

    locality_list = scrapertools.get_content(BASE_URL + LOCALITY_LIST_URL)
    localities = locality_list.body.div.img.find_all_next('a')

    for locality in localities:
        city_list = scrapertools.get_content(BASE_URL + locality['href'] + CITY_LIST_URL)

        for city in city_list.find_all('a'):
            name = str(city.string).strip()
            if name in seen_names:
                continue
            seen_names.add(name)

            # The parenthesised single-argument form behaves the same on
            # Python 2 and Python 3, unlike the bare print statement.
            print('Loading ' + name)
            href = city['href']
            cities_found.append({
                'name': name,
                'locality': str(locality.string).strip(),
                # NOTE(review): str.find returns -1 when the marker is absent,
                # in which case this slice silently drops the last character
                # of the href -- confirm every city link contains the marker.
                'baseUrl': href[:href.find('Food-List-ByName')],
                'establishmentUrl': href.replace('Count=30', '')
            })

    return cities_found
コード例 #6
0
def get_establishment_details(establishment):
    """Fill in the ``city`` and ``type`` keys of ``establishment``.

    The establishment's detail page is fetched from its ``url`` key and all
    ``<br>`` elements are removed from it before the labels are looked up.
    Either value falls back to ``'Unknown'`` when it cannot be read from the
    page.

    Args:
        establishment: dict with a ``url`` key; it is modified in place.

    Returns:
        The same ``establishment`` dict.
    """
    establishment_details = scrapertools.get_content(BASE_URL + establishment['url'])
    for linebreak in establishment_details.find_all('br'):
        linebreak.extract()

    # ``except Exception`` keeps the best-effort fallback for any lookup
    # failure (e.g. a missing label yields None and an AttributeError) while
    # letting KeyboardInterrupt and SystemExit propagate, which the original
    # bare ``except:`` swallowed.
    try:
        location_label = establishment_details.find(text=re.compile('^Facility Location'))
        establishment['city'] = location_label.parent.next_sibling.next_sibling.string
    except Exception:
        establishment['city'] = 'Unknown'

    try:
        type_label = establishment_details.find(text=re.compile('^Facility Type'))
        establishment['type'] = type_label.parent.next_sibling.string
    except Exception:
        establishment['type'] = 'Unknown'

    return establishment
コード例 #7
0
def get_inspections(establishment, city_url):
    """Return the inspections listed on an establishment's detail page.

    Every table row that follows the row holding the "Inspection Type" label
    is examined; rows whose first cell contains a link are treated as
    inspections, and the link is followed to load that inspection's
    violations.

    Args:
        establishment: dict with a ``url`` key for the detail page.
        city_url: path placed between ``BASE_URL`` and the inspection link's
            href when building the violations URL.

    Returns:
        list of dict: one entry per inspection, with the keys ``type``,
        ``date`` and ``violations``.  The list is empty when the page has no
        "Inspection Type" header row.
    """
    inspections_found = []

    establishment_details = scrapertools.get_content(BASE_URL + establishment['url'])

    # A page without the header label (or with the label outside a table row)
    # has no inspection table to read; the original raised here instead.
    header_label = establishment_details.find(text='Inspection Type')
    if header_label is None:
        return inspections_found
    header_row = header_label.find_parent('tr')
    if header_row is None:
        return inspections_found

    for inspection in header_row.find_all_next('tr'):
        details = inspection.find_all('td')

        # find_all_next returns every later row in the document, so rows with
        # fewer than the two cells read below, or without a link in the first
        # cell, are skipped rather than indexed blindly.
        if len(details) < 2 or details[0].a is None:
            continue

        violations = get_violations(BASE_URL + city_url + '/' + details[0].a['href'])
        inspections_found.append({
            'type': scrapertools.get_text(details[0]),
            'date': datetime.strptime(scrapertools.get_text(details[1]), '%d-%b-%Y'),
            'violations': violations
        })

    return inspections_found
コード例 #8
0
def get_violations(inspection_details_url):
    """Return the violations listed on an inspection detail page.

    The first table after the "Violations:" label is read; its first row is
    skipped and each following sibling contributes one violation.  In the
    second cell of a row, ``<b>`` elements supply the Repeat / Critical /
    Corrected flags, ``<font>`` elements the correction text, and bare text
    nodes the observation text.

    Args:
        inspection_details_url: full URL of the inspection detail page.

    Returns:
        list of dict: one entry per violation, with the keys ``code``,
        ``repeat``, ``critical``, ``corrected``, ``correction`` and
        ``observation``.  The list is empty when the label, the table or the
        table's first row is missing.
    """
    inspection_details = scrapertools.get_content(inspection_details_url)

    # Each lookup can come back empty; the original only guarded the table
    # and raised AttributeError when the label or first row was absent.
    heading = inspection_details.find(text='Violations:')
    if heading is None:
        return []
    table = heading.find_next('table')
    if table is None:
        return []
    header_row = table.find('tr')
    if header_row is None:
        return []

    violations_found = []
    for violation in header_row.find_next_siblings():
        details = violation.find_all('td')
        # Both cells are read below; skip rows that do not have them.
        if len(details) < 2:
            continue

        nodes = details[1].contents
        # Gather the bold labels once instead of rescanning for every flag.
        labels = [tag.string for tag in nodes if tag.name == 'b']

        violations_found.append({
            'code': scrapertools.get_all_text(details[0])[0],
            'repeat': any('Repeat' in label for label in labels),
            'critical': any('Critical' in label for label in labels),
            'corrected': any('Corrected' in label for label in labels),
            'correction': ' '.join([tag.string for tag in nodes if tag.name == 'font']).strip(),
            'observation': ' '.join([tag.string for tag in nodes if tag.name is None]).strip()
        })
    return violations_found