#pull data from new hanover county, nc, sheriff's p2c site
from scrape_bulletin import try_bulletin, start_scrape
from scraper_commands import print_files, all_data
from db_load import db_load
from scraper_config import make_config

def main():
    #get paths and db credentials; unlike the generic scraper,
    #the site to scrape is hardcoded below rather than fetched
    #from the google spreadsheet
    home_dir, data_dir, database, db_user, db_pw, commands_url = make_config('_nh')
    site = {
        'URL': 'http://p2c.nhcgov.com/p2c/Summary.aspx',
        'Agency': "New Hanover County Sheriff's Office",
        'County': 'New Hanover',
        'How far back': '7'
    }
    #variables we'll use in our scraping and data format
    county = site['County']
    url = site['URL']
    agency = site['Agency']
    #this is how many days back we want to scrape
    #e.g. 1 would scrape a total of 2 days:
    #   today plus 1 day back (yesterday)
    howfar = int(site['How far back'])
    #try for daily bulletin
    bulletin_url = try_bulletin(url)
    start_scrape(agency, county, bulletin_url, howfar)
    #output data as tab-delimited text files named for the
    #record type (arrest.txt, incident.txt, citation.txt, accident.txt)
    print_files(all_data, data_dir)
    for data_type in all_data:
        data_file = data_dir + '/' + data_type + '.txt'
        table = data_type.lower() + 's'
        db_load(database, data_file, table, db_user, db_pw)
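#make_config isn't shown in this repo; a minimal sketch of what it
#might return, assuming the suffix names a per-site data directory --
#every value below except database='crime' is a hypothetical placeholder
def make_config(suffix=''):
    home_dir = '/home/scraper/'                 #hypothetical install path
    data_dir = home_dir + 'data' + suffix       #e.g. '_nh' -> .../data_nh
    database = 'crime'                          #db name used elsewhere in this repo
    db_user = 'username'                        #hypothetical credentials
    db_pw = 'password'
    commands_url = 'https://docs.google.com/'   #hypothetical control-spreadsheet url
    return home_dir, data_dir, database, db_user, db_pw, commands_url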
def find_min_date(agency):
    home_dir, data_dir, database, db_user, db_pw, commands_url = make_config()
    data_tables = ['accidents', 'arrests', 'citations', 'incidents']
    connection = MySQLdb.connect(user=db_user, passwd=db_pw, db=database)
    cursor = connection.cursor()
    sql = 'select min(date_reported) from incidents where agency = "' + agency + '" and date_reported > "00-00-00"'
    cursor.execute(sql)
    min_date = cursor.fetchone()[0]
    if not min_date:
        sql = 'select min(date_occurred) from arrests where agency = "' + agency + '" and date_occurred > "00-00-00"'
        cursor.execute(sql)
        min_date = cursor.fetchone()[0]
    if agency == 'Huntersville Police Department':
        sql = 'select min(date_occurred) from citations where agency = "' + agency + '" and date_occurred > "00-00-00"'
        cursor.execute(sql)
        min_date = cursor.fetchone()[0]
    cursor.close()
    return min_date
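#the queries above splice agency straight into the sql string; a minimal
#sketch of the first query rewritten with MySQLdb's own parameter
#substitution, which escapes the value for us (same schema assumed):
sql = ('select min(date_reported) from incidents '
       'where agency = %s and date_reported > "00-00-00"')
cursor.execute(sql, (agency,))
min_date = cursor.fetchone()[0]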
import sys

import scrape_bulletin
from scraper_commands import print_files, fetch_commands, all_data
from db_load import db_load
from scraper_config import make_config

def main():
    home_dir, data_dir, database, db_user, db_pw, commands_url = make_config()
    #fetch data from our google spreadsheet that tells us what to scrape
    sites_to_scrape = fetch_commands(commands_url)
    #pick out site we want as index from list of sites
    #passed as an argument to this script
    site = sites_to_scrape[int(sys.argv[1])]
    #variables we'll use in our scraping and data format
    county = site['County']
    url = site['URL']
    agency = site['Agency']
    #this is how many days back we want to scrape
    #e.g. 1 would scrape a total of 2 days:
    #   today plus 1 day back (yesterday)
    howfar = int(site['How far back'])
    #try for daily bulletin
    #if not, then go for search
    bulletin_url = scrape_bulletin.try_bulletin(url)
    if bulletin_url:
        if bulletin_url == 'unreachable':
            print "\t".join([url, bulletin_url])
            data = None     #fall through to the failure branch below
        else:
            data = scrape_bulletin.start_scrape(agency, county, bulletin_url, howfar)
    else:
        #we'll need to import the functionality to scrape a search site
        import scrape_search
        data = scrape_search.start_scrape(agency, url, howfar, county)
    if not data:
        #bail out here so we don't write files or load empty data
        print "\t".join([url, "failed"])
        exit()
    #output data as tab-delimited text files named for the
    #record type (arrest.txt, incident.txt, citation.txt, accident.txt)
    print_files(all_data, data_dir, site['Site'])
    for data_type in all_data:
        data_file = data_dir + '/' + site['Site'] + data_type + '.txt'
        table = data_type.lower() + 's'
        db_load(database, data_file, table, db_user, db_pw)
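#invocation sketch: the argument is the row index into the control
#spreadsheet, so -- assuming this file were saved as scrape_p2c.py,
#a hypothetical name -- scraping the fourth listed site from cron or
#a shell would look like:
#   python scrape_p2c.py 3
if __name__ == '__main__':
    main()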
#!/usr/bin/env python
#pull data from fayetteville, nc, pd's p2c site
#this is custom because that site is so different from others

import scrape_bulletin
import scrape_search_fay
from scraper_commands import check_data, print_files, fetch_commands, all_data
from db_load import db_load
from scraper_config import make_config

home_dir, data_dir, database, db_user, db_pw, commands_url = make_config()
data_dir = home_dir + 'data_fay'
database = 'crime'
user = {'user': '******', 'pw': 'redaolemirc'}

def main():
    #the site list is hardcoded here rather than fetched
    #from our google spreadsheet
    sites_to_scrape = [{
        'URL': 'http://p2c.bethebadge.com/p2c/Summary.aspx',
        'Agency': 'Fayetteville Police Department',
        'County': 'Cumberland',
        'How far back': '7'
    }]
    for site in sites_to_scrape:
        #variables we'll use in our scraping and data format
        county = site['County']
        url = site['URL']
        agency = site['Agency']
        #this is how many days back we want to scrape
        #e.g. 1 would scrape a total of 2 days:
        #   today plus 1 day back (yesterday)
        howfar = int(site['How far back'])
        #try for daily bulletin
        #if not, then go for search
        bulletin_url = scrape_bulletin.try_bulletin(url)
#!/usr/bin/env python
#script that tries to attach lat/lon coords to data using google's geocode api
#ungeocoded data has zeros for these fields.
#we use -1 for those where the geocoder failed for some reason so we don't try those again

import requests
import json
import re
import time
import MySQLdb

from scraper_config import make_config

#we only need db, user and pw to connect to the database
home_dir, data_dir, db, user, pw, commands_url = make_config('')

#these are values returned by the geocode api. any result that isn't among
#these types/location_types is considered a failure
acceptable_types = ['RANGE_INTERPOLATED', 'ROOFTOP', 'GEOMETRIC_CENTER']
acceptable_location_types = [
    'bus_station',
    'transit_station',
    'establishment',
    'intersection',
    'street_number',
    'parking'
]
url = 'https://maps.googleapis.com/maps/api/geocode/json'
#a value we add to the record to indicate the source of the lat/lon
geocoder = 'Google'
#we'll geocode data from tables in this list
data_tables = ['arrests', 'incidents']
#the database connection we'll use
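#a minimal sketch of one geocode call against the api url above -- the
#geocode() helper, its arguments, and the filtering are assumptions
#patterned on the acceptable_* lists, not this repo's actual code
#(note: the current version of this api also requires a 'key' parameter)
def geocode(address, county):
    params = {'address': address + ', ' + county + ' County, NC'}
    response = requests.get(url, params=params)
    results = json.loads(response.text).get('results', [])
    for result in results:
        #location_type says how precise the fix is (e.g. ROOFTOP);
        #types says what kind of place it is (e.g. intersection)
        good_precision = result['geometry']['location_type'] in acceptable_types
        good_kind = set(result['types']) & set(acceptable_location_types)
        if not (good_precision or good_kind):
            continue
        location = result['geometry']['location']
        return location['lat'], location['lng']
    #mark as failed so we don't retry this address on the next run
    return -1, -1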