def get_establishments(city):
    """Scrape the paginated establishment listing for *city*.

    Pages through the city's establishment search results 1000 rows at a
    time until the site reports 'No documents found'.

    Args:
        city: dict with at least the keys 'establishmentUrl', 'locality'
            and 'baseUrl' (presumably one entry of a scraper config —
            TODO confirm against caller).

    Returns:
        list of dicts, one per establishment, with keys: slug, name, url,
        address, locality, last_inspection_date (datetime or None),
        baseUrl, inserted.
    """
    establishments_found = []
    start = 1
    count = 1000
    # Compile once outside the loop; raw strings avoid the invalid escape
    # sequences (`\(`, `\s`, `\)`) the original non-raw patterns contained.
    # Matches establishment IDs like "#12-3456" or "(12-3456".
    id_re = re.compile(r'(#|\()\s*([0-9]{2}-[0-9]{4})(\))?')
    newline_re = re.compile(r'\n+')
    more = True
    while more:
        establishment_list = scrapertools.get_content(
            BASE_URL + city['establishmentUrl']
            + '&start=' + str(start) + '&count=' + str(count))
        # Sentinel text on the page means we have paged past the last result.
        if establishment_list.find(text='No documents found') is not None:
            more = False
            continue
        start += count
        for establishment in establishment_list.find_all('tr'):
            details = establishment.find_all('td')
            # Data rows have exactly 4 cells and a link in the first one.
            if len(details) == 4 and details[0] is not None and details[0].a is not None:
                raw_date = scrapertools.get_text(details[3])
                date = None if raw_date is None else datetime.strptime(raw_date, '%d-%b-%Y')
                # Removes establishment IDs and newlines from the establishment name
                name = id_re.sub('', scrapertools.get_text(details[0]))
                name = newline_re.sub(' ', name)
                address = scrapertools.get_text(details[2])
                slug_id = slugify(name.strip() + ' ' + address)
                establishments_found.append({
                    'slug': slug_id,
                    'name': name.strip(),
                    'url': details[0].a['href'],
                    'address': address,
                    'locality': city['locality'],
                    'last_inspection_date': date,
                    'baseUrl': city['baseUrl'],
                    'inserted': datetime.now()
                })
    return establishments_found
def get_establishments(city):
    """Scrape the paginated establishment listing for *city*.

    Pages through the search results 1000 rows at a time until the site
    reports 'No documents found', collecting one record per data row.

    NOTE(review): this redefines ``get_establishments`` — a fuller version
    (with slug/name cleanup) exists in this same file, and at import time
    the later definition silently overrides the earlier one. Confirm which
    version is intended.

    Returns:
        list of dicts with keys: name, url, address, locality,
        last_inspection_date (datetime or None).
    """
    found = []
    offset = 1
    page_size = 1000
    while True:
        page = scrapertools.get_content(
            BASE_URL + city['establishmentUrl']
            + '&start=' + str(offset) + '&count=' + str(page_size))
        # The sentinel text marks the end of the result pages.
        if page.find(text='No documents found') is not None:
            break
        offset += page_size
        for row in page.find_all('tr'):
            cells = row.find_all('td')
            # Only rows with exactly 4 cells and a link in the first cell
            # are establishment records; skip everything else.
            if len(cells) != 4 or cells[0] is None or cells[0].a is None:
                continue
            date_text = scrapertools.get_text(cells[3])
            found.append({
                'name': scrapertools.get_text(cells[0]),
                'url': cells[0].a['href'],
                'address': scrapertools.get_text(cells[2]),
                'locality': city['locality'],
                'last_inspection_date': (
                    None if date_text is None
                    else datetime.strptime(date_text, '%d-%b-%Y'))
            })
    return found
def get_inspections(establishment, city_url):
    """Scrape all inspections (with their violations) for one establishment.

    Fetches the establishment's detail page, then walks every table row
    following the row that contains the 'Inspection Type' header text.

    Args:
        establishment: dict with at least a 'url' key (relative to BASE_URL).
        city_url: city path segment prefixed to each inspection detail link.

    Returns:
        list of dicts with keys: type, date (datetime), violations.
    """
    inspections_found = []
    establishment_details = scrapertools.get_content(BASE_URL + establishment['url'])
    # Rows after the 'Inspection Type' header row hold the inspection records.
    header = establishment_details.find_all(text='Inspection Type')[0]
    for inspection in header.find_parent('tr').find_all_next('tr'):
        details = inspection.find_all('td')
        # Guard against rows with no <td> cells (the original indexed
        # details[0] unguarded, raising IndexError on such rows), and skip
        # rows whose first cell has no detail link.
        if not details or details[0].a is None:
            continue
        violations = get_violations(BASE_URL + city_url + '/' + details[0].a['href'])
        inspections_found.append({
            'type': scrapertools.get_text(details[0]),
            'date': datetime.strptime(scrapertools.get_text(details[1]), '%d-%b-%Y'),
            'violations': violations
        })
    return inspections_found