def find_records(soup, community, agency, county, url):
    """Parse a search-results page and queue one normalized record per row.

    Pulls the ASP.NET __VIEWSTATE / __EVENTVALIDATION tokens (needed for the
    PDF-download postback) and, for each grid row, builds a record dict that
    is validated via scraper_commands.check_data and appended to
    scraper_commands.all_data under its record type.

    Args:
        soup: BeautifulSoup of the search-results page.
        community: unused here; kept for call-site compatibility.
        agency: agency label stored on every record.
        county: county label stored on every record.
        url: site base URL, forwarded to dl_pdf.

    Uses module globals `date_range` (the submitted search form fields) and
    `county_st` (suffix appended to street addresses).
    """
    global date_range
    records = soup.find_all("tr", {"class": "EventSearchGridRow"})
    v = soup.find("input", {"id": "__VIEWSTATE"})["value"]
    e = soup.find("input", {"id": "__EVENTVALIDATION"})["value"]
    v_e = {"__VIEWSTATE": v, "__EVENTVALIDATION": e}
    for record in records:
        other_data = {"scrape_type": "search", "id_generate": "0"}
        data = {}
        id_and_type = {}
        record_fields = record.find_all("td")
        id_and_type["record_type"] = record_fields[2].string.strip()  # record type
        data["occurred_date"] = date_formatters.format_db_datetime(record_fields[1].string.strip())  # date
        data["address"] = re.sub(r" +", " ", record_fields[4].string.strip())
        if re.search("[A-Za-z]+", data["address"]) is not None:
            data["address"] = data["address"] + county_st
        if id_and_type["record_type"] == "Incident":
            # Incidents carry no per-row reported date; reuse the search
            # window's "from" date that was submitted with the form.
            data["reported_date"] = date_formatters.format_db_date(
                date_range["MasterPage$mainContent$txtDateFrom$txtDatePicker"]
            )
            data["date_reported"] = data["reported_date"]
            data["time_reported"] = ""
            data["on_date"] = data["occurred_date"]
        else:
            data["date_occurred"] = date_formatters.format_db_date_part(record_fields[1].string.strip())
            data["time_occurred"] = date_formatters.format_db_time_part(record_fields[1].string.strip())
        strongs = record_fields[3].find_all("strong")
        if id_and_type["record_type"] != "Accident":
            # BUGFIX: rows without the expected two <strong> labels used to
            # raise IndexError on strongs[1]; skip them instead (matches the
            # guarded variant of this function elsewhere in the file).
            if len(strongs) < 2:
                continue
            data["charge"] = remove_semicolon(strongs[1].next_sibling.strip())  # offense text
        else:
            data["charge"] = ""
        if id_and_type["record_type"] == "Arrest":
            data["name"] = strongs[0].next_sibling.strip()  # arrestee
            # Arrest rows have no case number: derive a stable synthetic id.
            # Explicit UTF-8 encode keeps the digest identical for ASCII and
            # avoids a crash on non-ASCII names/addresses.
            id_and_type["record_id"] = hashlib.sha224(
                (data["name"] + data["occurred_date"] + data["address"] + data["charge"]).encode("utf-8")
            ).hexdigest()
            other_data["id_generate"] = "1"
        else:
            if len(strongs) == 0:
                id_and_type["record_id"] = hashlib.sha224(
                    (data["occurred_date"] + data["address"]).encode("utf-8")
                ).hexdigest()
                # BUGFIX: flag the id as generated, consistent with the
                # Arrest branch above.
                other_data["id_generate"] = "1"
            else:
                id_and_type["record_id"] = strongs[0].next_sibling.strip()  # case number
    # this is to download the pdf. not sure if we want to try that now.
        # A <div> (icon) inside the link marks an attached pdf.
        # BUGFIX: guard against rows with no <a> at all (used to raise
        # AttributeError on None).
        link = record_fields[5].find("a")
        has_gif = link.find("div") if link is not None else None
        if has_gif is None:  # there's no pdf
            data["pdf"] = ""
        else:
            data["pdf"] = dl_pdf(
                link["href"].strip().split("'")[1], id_and_type, agency, v_e, url
            )  # pdf stuff
        # Merge the pieces without relying on the Py2-only items() list
        # concatenation (dict.update works identically on Py2 and Py3).
        data.update(other_data)
        data.update(id_and_type)
        data.update({"agency": agency, "county": county})
        scraper_commands.all_data[id_and_type["record_type"]].append(scraper_commands.check_data(data))
def find_records(soup, community, agency, county, url):
    """Parse a search-results page and queue one normalized record per row.

    Variant that calls the module-level helpers directly (format_db_datetime,
    all_data, check_data) and keys the incident reported date off the
    'MasterPage$mainContent$txtDateFrom2' form field.

    Args:
        soup: BeautifulSoup of the search-results page.
        community: unused here; kept for call-site compatibility.
        agency: agency label stored on every record.
        county: county label stored on every record.
        url: site base URL, forwarded to dl_pdf.

    Uses module globals `date_range` (the submitted search form fields) and
    `county_st` (suffix appended to street addresses).
    """
    global date_range
    records = soup.find_all('tr', {"class": "EventSearchGridRow"})
    v = soup.find('input', {'id': "__VIEWSTATE"})['value']
    e = soup.find('input', {'id': "__EVENTVALIDATION"})['value']
    v_e = {'__VIEWSTATE': v, '__EVENTVALIDATION': e}
    for record in records:
        other_data = {'scrape_type': 'search', 'id_generate': '0'}
        data = {}
        id_and_type = {}
        record_fields = record.find_all('td')
        id_and_type['record_type'] = record_fields[2].string.strip()  # record type
        data['occurred_date'] = format_db_datetime(record_fields[1].string.strip())  # date
        data['address'] = re.sub(r' +', ' ', record_fields[4].string.strip())
        if re.search('[A-Za-z]+', data['address']) is not None:
            data['address'] = data['address'] + county_st
        if id_and_type['record_type'] == 'Incident':
            # Incidents carry no per-row reported date; reuse the search
            # window's "from" date that was submitted with the form.
            data['reported_date'] = format_db_date(date_range['MasterPage$mainContent$txtDateFrom2'])
            data['date_reported'] = data['reported_date']
            data['time_reported'] = ''
            data['on_date'] = data['occurred_date']
        else:
            data['date_occurred'] = format_db_date_part(record_fields[1].string.strip())
            data['time_occurred'] = format_db_time_part(record_fields[1].string.strip())
        if id_and_type['record_type'] != 'Accident':
            if len(record_fields) < 4 or len(record_fields[3].find_all('strong')) < 2:
                # Malformed row: dump it for inspection and move on rather
                # than crash on strong[1] below. Single-arg parenthesized
                # print is valid on both Py2 and Py3 with identical output.
                print(record)
                continue
            data['charge'] = remove_semicolon(
                record_fields[3].find_all('strong')[1].next_sibling.strip()
            )  # offense text
        else:
            data['charge'] = ''
        if id_and_type['record_type'] == 'Arrest':
            data['name'] = record_fields[3].find_all('strong')[0].next_sibling.strip()  # arrestee
            # Arrest rows have no case number: derive a stable synthetic id.
            # Explicit UTF-8 encode keeps the digest identical for ASCII and
            # avoids a crash on non-ASCII names/addresses.
            id_and_type['record_id'] = hashlib.sha224(
                (data['name'] + data['occurred_date'] + data['address'] + data['charge']).encode('utf-8')
            ).hexdigest()
            other_data['id_generate'] = '1'
        else:
            if len(record_fields[3].find_all('strong')) == 0:
                id_and_type['record_id'] = hashlib.sha224(
                    (data['occurred_date'] + data['address']).encode('utf-8')
                ).hexdigest()
                other_data['id_generate'] = '1'
            else:
                id_and_type['record_id'] = record_fields[3].find_all('strong')[0].next_sibling.strip()  # case number
    # this is to download the pdf. not sure if we want to try that now.
        # A <div> (icon) inside the link marks an attached pdf.
        # BUGFIX: guard against rows with no <a> at all (used to raise
        # AttributeError on None).
        link = record_fields[5].find('a')
        has_gif = link.find('div') if link is not None else None
        if has_gif is None:  # there's no pdf
            data['pdf'] = ''
        else:
            data['pdf'] = dl_pdf(
                link['href'].strip().split("'")[1], id_and_type, agency, v_e, url
            )  # pdf stuff
        # Merge the pieces without relying on the Py2-only items() list
        # concatenation (dict.update works identically on Py2 and Py3).
        data.update(other_data)
        data.update(id_and_type)
        data.update({'agency': agency, 'county': county})
        all_data[id_and_type['record_type']].append(check_data(data))
def find_records(soup, community, agency, county, url):
    """Parse a search-results page and queue one normalized record per row.

    Pulls the ASP.NET __VIEWSTATE / __EVENTVALIDATION tokens (needed for the
    PDF-download postback) and, for each grid row, builds a record dict that
    is validated via scraper_commands.check_data and appended to
    scraper_commands.all_data under its record type.

    Args:
        soup: BeautifulSoup of the search-results page.
        community: unused here; kept for call-site compatibility.
        agency: agency label stored on every record.
        county: county label stored on every record.
        url: site base URL, forwarded to dl_pdf.

    Uses module globals `date_range` (the submitted search form fields) and
    `county_st` (suffix appended to street addresses).
    """
    global date_range
    records = soup.find_all('tr', {'class': 'EventSearchGridRow'})
    v = soup.find('input', {'id': "__VIEWSTATE"})['value']
    e = soup.find('input', {'id': "__EVENTVALIDATION"})['value']
    v_e = {'__VIEWSTATE': v, '__EVENTVALIDATION': e}
    for record in records:
        other_data = {'scrape_type': 'search', 'id_generate': '0'}
        data = {}
        id_and_type = {}
        record_fields = record.find_all('td')
        id_and_type['record_type'] = record_fields[2].string.strip()  # record type
        data['occurred_date'] = date_formatters.format_db_datetime(record_fields[1].string.strip())  # date
        data['address'] = re.sub(r' +', ' ', record_fields[4].string.strip())
        if re.search('[A-Za-z]+', data['address']) is not None:
            data['address'] = data['address'] + county_st
        if id_and_type['record_type'] == 'Incident':
            # Incidents carry no per-row reported date; reuse the search
            # window's "from" date that was submitted with the form.
            data['reported_date'] = date_formatters.format_db_date(
                date_range['MasterPage$mainContent$txtDateFrom$txtDatePicker']
            )
            data['date_reported'] = data['reported_date']
            data['time_reported'] = ''
            data['on_date'] = data['occurred_date']
        else:
            data['date_occurred'] = date_formatters.format_db_date_part(record_fields[1].string.strip())
            data['time_occurred'] = date_formatters.format_db_time_part(record_fields[1].string.strip())
        strongs = record_fields[3].find_all('strong')
        if id_and_type['record_type'] != 'Accident':
            # BUGFIX: rows without the expected two <strong> labels used to
            # raise IndexError on strongs[1]; skip them instead (matches the
            # guarded variant of this function elsewhere in the file).
            if len(strongs) < 2:
                continue
            data['charge'] = remove_semicolon(strongs[1].next_sibling.strip())  # offense text
        else:
            data['charge'] = ''
        if id_and_type['record_type'] == 'Arrest':
            data['name'] = strongs[0].next_sibling.strip()  # arrestee
            # Arrest rows have no case number: derive a stable synthetic id.
            # Explicit UTF-8 encode keeps the digest identical for ASCII and
            # avoids a crash on non-ASCII names/addresses.
            id_and_type['record_id'] = hashlib.sha224(
                (data['name'] + data['occurred_date'] + data['address'] + data['charge']).encode('utf-8')
            ).hexdigest()
            other_data['id_generate'] = '1'
        else:
            if len(strongs) == 0:
                id_and_type['record_id'] = hashlib.sha224(
                    (data['occurred_date'] + data['address']).encode('utf-8')
                ).hexdigest()
                # BUGFIX: flag the id as generated, consistent with the
                # Arrest branch above.
                other_data['id_generate'] = '1'
            else:
                id_and_type['record_id'] = strongs[0].next_sibling.strip()  # case number
    # this is to download the pdf. not sure if we want to try that now.
        # A <div> (icon) inside the link marks an attached pdf.
        # BUGFIX: guard against rows with no <a> at all (used to raise
        # AttributeError on None).
        link = record_fields[5].find('a')
        has_gif = link.find('div') if link is not None else None
        if has_gif is None:  # there's no pdf
            data['pdf'] = ''
        else:
            data['pdf'] = dl_pdf(
                link['href'].strip().split("'")[1], id_and_type, agency, v_e, url
            )  # pdf stuff
        # Merge the pieces without relying on the Py2-only items() list
        # concatenation (dict.update works identically on Py2 and Py3).
        data.update(other_data)
        data.update(id_and_type)
        data.update({'agency': agency, 'county': county})
        scraper_commands.all_data[id_and_type['record_type']].append(scraper_commands.check_data(data))
def parse_details(piece, id_and_type, officer):
    """Route *piece* to the parser matching its record type and queue the result.

    Incident, Arrest and Citation rows go to their dedicated parsers; any
    other record type is treated as an Accident. A parser may return None to
    signal an unusable record, in which case nothing is queued. Always
    returns None.
    """
    parsers = {
        'Incident': parse_incident,
        'Arrest': parse_arrest,
        'Citation': parse_citation,
    }
    parser = parsers.get(id_and_type['record_type'], parse_accident)
    # data holds the pulled-and-formatted items for this record; it gets
    # appended to the record_type bucket in all_data for a later single dump.
    data = parser(piece, id_and_type, officer)
    if data is None:
        return
    # list.append returns None, so a plain call matches the original's
    # `return ...append(...)` return value exactly.
    all_data[id_and_type['record_type']].append(check_data(data))