def find_records(soup, community, agency, county, url):
    """Parse a search-results page and queue one normalized record per row.

    Pulls the ASP.NET __VIEWSTATE / __EVENTVALIDATION tokens (needed for the
    PDF-download postback) and, for each grid row, builds a record dict that
    is validated via scraper_commands.check_data and appended to
    scraper_commands.all_data under its record type.

    Args:
        soup: BeautifulSoup of the search-results page.
        community: unused here; kept for call-site compatibility.
        agency: agency label stored on every record.
        county: county label stored on every record.
        url: site base URL, forwarded to dl_pdf.

    Uses module globals `date_range` (the submitted search form fields) and
    `county_st` (suffix appended to street addresses).
    """
    global date_range
    records = soup.find_all("tr", {"class": "EventSearchGridRow"})
    v = soup.find("input", {"id": "__VIEWSTATE"})["value"]
    e = soup.find("input", {"id": "__EVENTVALIDATION"})["value"]
    v_e = {"__VIEWSTATE": v, "__EVENTVALIDATION": e}
    for record in records:
        other_data = {"scrape_type": "search", "id_generate": "0"}
        data = {}
        id_and_type = {}
        record_fields = record.find_all("td")
        id_and_type["record_type"] = record_fields[2].string.strip()  # record type
        data["occurred_date"] = date_formatters.format_db_datetime(record_fields[1].string.strip())  # date
        data["address"] = re.sub(r" +", " ", record_fields[4].string.strip())
        if re.search("[A-Za-z]+", data["address"]) is not None:
            data["address"] = data["address"] + county_st
        if id_and_type["record_type"] == "Incident":
            # Incidents carry no per-row reported date; reuse the search
            # window's "from" date that was submitted with the form.
            data["reported_date"] = date_formatters.format_db_date(
                date_range["MasterPage$mainContent$txtDateFrom$txtDatePicker"]
            )
            data["date_reported"] = data["reported_date"]
            data["time_reported"] = ""
            data["on_date"] = data["occurred_date"]
        else:
            data["date_occurred"] = date_formatters.format_db_date_part(record_fields[1].string.strip())
            data["time_occurred"] = date_formatters.format_db_time_part(record_fields[1].string.strip())
        strongs = record_fields[3].find_all("strong")
        if id_and_type["record_type"] != "Accident":
            # BUGFIX: rows without the expected two <strong> labels used to
            # raise IndexError on strongs[1]; skip them instead (matches the
            # guarded variant of this function elsewhere in the file).
            if len(strongs) < 2:
                continue
            data["charge"] = remove_semicolon(strongs[1].next_sibling.strip())  # offense text
        else:
            data["charge"] = ""
        if id_and_type["record_type"] == "Arrest":
            data["name"] = strongs[0].next_sibling.strip()  # arrestee
            # Arrest rows have no case number: derive a stable synthetic id.
            # Explicit UTF-8 encode keeps the digest identical for ASCII and
            # avoids a crash on non-ASCII names/addresses.
            id_and_type["record_id"] = hashlib.sha224(
                (data["name"] + data["occurred_date"] + data["address"] + data["charge"]).encode("utf-8")
            ).hexdigest()
            other_data["id_generate"] = "1"
        else:
            if len(strongs) == 0:
                id_and_type["record_id"] = hashlib.sha224(
                    (data["occurred_date"] + data["address"]).encode("utf-8")
                ).hexdigest()
                # BUGFIX: flag the id as generated, consistent with the
                # Arrest branch above.
                other_data["id_generate"] = "1"
            else:
                id_and_type["record_id"] = strongs[0].next_sibling.strip()  # case number
    # this is to download the pdf. not sure if we want to try that now.
        # A <div> (icon) inside the link marks an attached pdf.
        # BUGFIX: guard against rows with no <a> at all (used to raise
        # AttributeError on None).
        link = record_fields[5].find("a")
        has_gif = link.find("div") if link is not None else None
        if has_gif is None:  # there's no pdf
            data["pdf"] = ""
        else:
            data["pdf"] = dl_pdf(
                link["href"].strip().split("'")[1], id_and_type, agency, v_e, url
            )  # pdf stuff
        # Merge the pieces without relying on the Py2-only items() list
        # concatenation (dict.update works identically on Py2 and Py3).
        data.update(other_data)
        data.update(id_and_type)
        data.update({"agency": agency, "county": county})
        scraper_commands.all_data[id_and_type["record_type"]].append(scraper_commands.check_data(data))
def find_records(soup, community, agency, county, url):
    """Parse a search-results page and queue one normalized record per row.

    Variant that calls the module-level helpers directly (format_db_datetime,
    all_data, check_data) and keys the incident reported date off the
    'MasterPage$mainContent$txtDateFrom2' form field.

    Args:
        soup: BeautifulSoup of the search-results page.
        community: unused here; kept for call-site compatibility.
        agency: agency label stored on every record.
        county: county label stored on every record.
        url: site base URL, forwarded to dl_pdf.

    Uses module globals `date_range` (the submitted search form fields) and
    `county_st` (suffix appended to street addresses).
    """
    global date_range
    records = soup.find_all('tr', {"class": "EventSearchGridRow"})
    v = soup.find('input', {'id': "__VIEWSTATE"})['value']
    e = soup.find('input', {'id': "__EVENTVALIDATION"})['value']
    v_e = {'__VIEWSTATE': v, '__EVENTVALIDATION': e}
    for record in records:
        other_data = {'scrape_type': 'search', 'id_generate': '0'}
        data = {}
        id_and_type = {}
        record_fields = record.find_all('td')
        id_and_type['record_type'] = record_fields[2].string.strip()  # record type
        data['occurred_date'] = format_db_datetime(record_fields[1].string.strip())  # date
        data['address'] = re.sub(r' +', ' ', record_fields[4].string.strip())
        if re.search('[A-Za-z]+', data['address']) is not None:
            data['address'] = data['address'] + county_st
        if id_and_type['record_type'] == 'Incident':
            # Incidents carry no per-row reported date; reuse the search
            # window's "from" date that was submitted with the form.
            data['reported_date'] = format_db_date(date_range['MasterPage$mainContent$txtDateFrom2'])
            data['date_reported'] = data['reported_date']
            data['time_reported'] = ''
            data['on_date'] = data['occurred_date']
        else:
            data['date_occurred'] = format_db_date_part(record_fields[1].string.strip())
            data['time_occurred'] = format_db_time_part(record_fields[1].string.strip())
        if id_and_type['record_type'] != 'Accident':
            if len(record_fields) < 4 or len(record_fields[3].find_all('strong')) < 2:
                # Malformed row: dump it for inspection and move on rather
                # than crash on strong[1] below. Single-arg parenthesized
                # print is valid on both Py2 and Py3 with identical output.
                print(record)
                continue
            data['charge'] = remove_semicolon(
                record_fields[3].find_all('strong')[1].next_sibling.strip()
            )  # offense text
        else:
            data['charge'] = ''
        if id_and_type['record_type'] == 'Arrest':
            data['name'] = record_fields[3].find_all('strong')[0].next_sibling.strip()  # arrestee
            # Arrest rows have no case number: derive a stable synthetic id.
            # Explicit UTF-8 encode keeps the digest identical for ASCII and
            # avoids a crash on non-ASCII names/addresses.
            id_and_type['record_id'] = hashlib.sha224(
                (data['name'] + data['occurred_date'] + data['address'] + data['charge']).encode('utf-8')
            ).hexdigest()
            other_data['id_generate'] = '1'
        else:
            if len(record_fields[3].find_all('strong')) == 0:
                id_and_type['record_id'] = hashlib.sha224(
                    (data['occurred_date'] + data['address']).encode('utf-8')
                ).hexdigest()
                other_data['id_generate'] = '1'
            else:
                id_and_type['record_id'] = record_fields[3].find_all('strong')[0].next_sibling.strip()  # case number
    # this is to download the pdf. not sure if we want to try that now.
        # A <div> (icon) inside the link marks an attached pdf.
        # BUGFIX: guard against rows with no <a> at all (used to raise
        # AttributeError on None).
        link = record_fields[5].find('a')
        has_gif = link.find('div') if link is not None else None
        if has_gif is None:  # there's no pdf
            data['pdf'] = ''
        else:
            data['pdf'] = dl_pdf(
                link['href'].strip().split("'")[1], id_and_type, agency, v_e, url
            )  # pdf stuff
        # Merge the pieces without relying on the Py2-only items() list
        # concatenation (dict.update works identically on Py2 and Py3).
        data.update(other_data)
        data.update(id_and_type)
        data.update({'agency': agency, 'county': county})
        all_data[id_and_type['record_type']].append(check_data(data))
def find_records(soup, community, agency, county, url):
    """Parse a search-results page and queue one normalized record per row.

    Pulls the ASP.NET __VIEWSTATE / __EVENTVALIDATION tokens (needed for the
    PDF-download postback) and, for each grid row, builds a record dict that
    is validated via scraper_commands.check_data and appended to
    scraper_commands.all_data under its record type.

    Args:
        soup: BeautifulSoup of the search-results page.
        community: unused here; kept for call-site compatibility.
        agency: agency label stored on every record.
        county: county label stored on every record.
        url: site base URL, forwarded to dl_pdf.

    Uses module globals `date_range` (the submitted search form fields) and
    `county_st` (suffix appended to street addresses).
    """
    global date_range
    records = soup.find_all('tr', {'class': 'EventSearchGridRow'})
    v = soup.find('input', {'id': "__VIEWSTATE"})['value']
    e = soup.find('input', {'id': "__EVENTVALIDATION"})['value']
    v_e = {'__VIEWSTATE': v, '__EVENTVALIDATION': e}
    for record in records:
        other_data = {'scrape_type': 'search', 'id_generate': '0'}
        data = {}
        id_and_type = {}
        record_fields = record.find_all('td')
        id_and_type['record_type'] = record_fields[2].string.strip()  # record type
        data['occurred_date'] = date_formatters.format_db_datetime(record_fields[1].string.strip())  # date
        data['address'] = re.sub(r' +', ' ', record_fields[4].string.strip())
        if re.search('[A-Za-z]+', data['address']) is not None:
            data['address'] = data['address'] + county_st
        if id_and_type['record_type'] == 'Incident':
            # Incidents carry no per-row reported date; reuse the search
            # window's "from" date that was submitted with the form.
            data['reported_date'] = date_formatters.format_db_date(
                date_range['MasterPage$mainContent$txtDateFrom$txtDatePicker']
            )
            data['date_reported'] = data['reported_date']
            data['time_reported'] = ''
            data['on_date'] = data['occurred_date']
        else:
            data['date_occurred'] = date_formatters.format_db_date_part(record_fields[1].string.strip())
            data['time_occurred'] = date_formatters.format_db_time_part(record_fields[1].string.strip())
        strongs = record_fields[3].find_all('strong')
        if id_and_type['record_type'] != 'Accident':
            # BUGFIX: rows without the expected two <strong> labels used to
            # raise IndexError on strongs[1]; skip them instead (matches the
            # guarded variant of this function elsewhere in the file).
            if len(strongs) < 2:
                continue
            data['charge'] = remove_semicolon(strongs[1].next_sibling.strip())  # offense text
        else:
            data['charge'] = ''
        if id_and_type['record_type'] == 'Arrest':
            data['name'] = strongs[0].next_sibling.strip()  # arrestee
            # Arrest rows have no case number: derive a stable synthetic id.
            # Explicit UTF-8 encode keeps the digest identical for ASCII and
            # avoids a crash on non-ASCII names/addresses.
            id_and_type['record_id'] = hashlib.sha224(
                (data['name'] + data['occurred_date'] + data['address'] + data['charge']).encode('utf-8')
            ).hexdigest()
            other_data['id_generate'] = '1'
        else:
            if len(strongs) == 0:
                id_and_type['record_id'] = hashlib.sha224(
                    (data['occurred_date'] + data['address']).encode('utf-8')
                ).hexdigest()
                # BUGFIX: flag the id as generated, consistent with the
                # Arrest branch above.
                other_data['id_generate'] = '1'
            else:
                id_and_type['record_id'] = strongs[0].next_sibling.strip()  # case number
    # this is to download the pdf. not sure if we want to try that now.
        # A <div> (icon) inside the link marks an attached pdf.
        # BUGFIX: guard against rows with no <a> at all (used to raise
        # AttributeError on None).
        link = record_fields[5].find('a')
        has_gif = link.find('div') if link is not None else None
        if has_gif is None:  # there's no pdf
            data['pdf'] = ''
        else:
            data['pdf'] = dl_pdf(
                link['href'].strip().split("'")[1], id_and_type, agency, v_e, url
            )  # pdf stuff
        # Merge the pieces without relying on the Py2-only items() list
        # concatenation (dict.update works identically on Py2 and Py3).
        data.update(other_data)
        data.update(id_and_type)
        data.update({'agency': agency, 'county': county})
        scraper_commands.all_data[id_and_type['record_type']].append(scraper_commands.check_data(data))
def parse_details(piece, id_and_type, officer):
    """Route *piece* to the parser matching its record type and queue the result.

    Incident, Arrest and Citation rows go to their dedicated parsers; any
    other record type is treated as an Accident. A parser may return None to
    signal an unusable record, in which case nothing is queued. Always
    returns None.
    """
    parsers = {
        'Incident': parse_incident,
        'Arrest': parse_arrest,
        'Citation': parse_citation,
    }
    parser = parsers.get(id_and_type['record_type'], parse_accident)
    # data holds the pulled-and-formatted items for this record; it gets
    # appended to the record_type bucket in all_data for a later single dump.
    data = parser(piece, id_and_type, officer)
    if data is None:
        return
    # list.append returns None, so a plain call matches the original's
    # `return ...append(...)` return value exactly.
    all_data[id_and_type['record_type']].append(check_data(data))