Example #1
def main():
    #fetch data from our google spreadsheet that tells us what to scrape
    home_dir, data_dir, database, db_user, db_pw, commands_url = make_config(
        '_nh')
    site = {
        'URL': 'http://p2c.nhcgov.com/p2c/Summary.aspx',
        'Agency': "New Hanover County Sheriff's Office",
        'County': 'New Hanover',
        'How far back': '7'
    }
    #variables we'll use in our scraping and data format
    county = site['County']
    url = site['URL']
    agency = site['Agency']
    #this is how many days back we want to scrape
    #e.g. 1 would scrape a total of 2 days:
    # today plus 1 day back (yesterday)
    howfar = int(site['How far back'])
    #try for daily bulletin
    bulletin_url = try_bulletin(url)
    start_scrape(agency, county, bulletin_url, howfar)
    #output data as tab-delimited text files named for the
    #record type (arrest.txt, incident.txt, citation.txt, accident.txt)
    print_files(all_data, data_dir)
    for data_type in all_data:
        data_file = data_dir + '/' + data_type + '.txt'
        table = data_type.lower() + 's'
        load(database, data_file, table, db_user, db_pw)
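
Example #1 leans on helpers like print_files and load that live in other modules and aren't shown in this section. Purely as a hedged illustration of the tab-delimited output described in the comments above, a print_files-style writer might look like the sketch below; the real scraper_commands.print_files may use a different data layout entirely.

import os

def print_files_sketch(all_data, data_dir):
    #hedged sketch: assumes all_data maps a record type ('Arrest', 'Incident', ...)
    #to a list of already-ordered field lists
    for data_type, records in all_data.items():
        out_path = os.path.join(data_dir, data_type + '.txt')
        with open(out_path, 'w') as out:
            for record in records:
                #one tab-delimited line per record
                out.write('\t'.join(str(field) for field in record) + '\n')
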
def find_min_date(agency):
    home_dir, data_dir, database, db_user, db_pw, commands_url = make_config()
    data_tables = ['accidents', 'arrests', 'citations', 'incidents']
    connection = MySQLdb.connect(user=db_user, passwd=db_pw, db=database)
    cursor = connection.cursor()
    sql = 'select min(date_reported) from incidents where agency = "' + agency + '" and date_reported > "00-00-00"'
    cursor.execute(sql)
    min_date = cursor.fetchone()[0]
    if not min_date:
        sql = 'select min(date_occurred) from arrests where agency = "' + agency + '" and date_occurred > "00-00-00"'
        cursor.execute(sql)
        min_date = cursor.fetchone()[0]
    #Huntersville is handled specially: take the earliest date from citations
    if agency == 'Huntersville Police Department':
        sql = 'select min(date_occurred) from citations where agency = "' + agency + '" and date_occurred > "00-00-00"'
        cursor.execute(sql)
        min_date = cursor.fetchone()[0]

    cursor.close()
    connection.close()
    return min_date
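
The lookups above build SQL by pasting the agency name into the query string. Since agency comes from the scraper's own config that generally works, but for reference, a parameterized version of the same incidents lookup (a sketch only, using the same table and column names) would let MySQLdb do the escaping:

def find_min_incident_date(cursor, agency):
    #same query as above, with the agency value passed as a parameter
    cursor.execute(
        'select min(date_reported) from incidents '
        'where agency = %s and date_reported > %s',
        (agency, '00-00-00'))
    return cursor.fetchone()[0]
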
Example #5
def main():
    home_dir, data_dir, database, db_user, db_pw, commands_url = make_config()
    sites_to_scrape = fetch_commands(commands_url)
    #pick out site we want as index from list of sites
    #passed as an argument to this script
    site = sites_to_scrape[int(sys.argv[1])]
    #variables we'll use in our scraping and data format
    county = site['County']
    url = site['URL']
    agency = site['Agency']
    #this is how many days back we want to scrape
    #e.g. 1 would scrape a total of 2 days:
    # today plus 1 day back (yesterday)
    howfar = int(site['How far back'])
    #try for daily bulletin
    #if not, then go for search
    bulletin_url = scrape_bulletin.try_bulletin(url)
    if bulletin_url:
        if bulletin_url == 'unreachable':
            print "\t".join([url, bulletin_url])
        else:
            data = scrape_bulletin.start_scrape(agency, county, bulletin_url, howfar)
    else:
        #we'll need to import the functionality to scrape a search site
        import scrape_search
        data = scrape_search.start_scrape(agency, url, howfar, county)
        if not data:
            print "\t".join([url, "failed"])
    #output data as tab-delimited text files named for the
    #record type (arrest.txt, incident.txt, citation.txt, accident.txt)
    print_files(all_data, data_dir, site['Site'])
    #note: this exit means the database load loop below never runs
    exit()
    for data_type in all_data:
        data_file = data_dir + '/' + site['Site'] + data_type + '.txt'
        table = data_type.lower() + 's'
        db_load(database, data_file, table, db_user, db_pw)
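
db_load is imported from another module (see the Fayetteville example below) and isn't shown in this section. Assuming it bulk-loads the tab-delimited files that print_files writes, a minimal sketch might use MySQL's LOAD DATA LOCAL INFILE; the real db_load may well insert row by row or map columns explicitly instead.

import MySQLdb

def db_load_sketch(database, data_file, table, db_user, db_pw):
    #hedged sketch: requires LOCAL INFILE to be enabled on both client and server
    connection = MySQLdb.connect(user=db_user, passwd=db_pw,
                                 db=database, local_infile=1)
    cursor = connection.cursor()
    #table names can't be parameterized, so the (trusted) table name is concatenated
    cursor.execute(
        "load data local infile %s into table " + table +
        " fields terminated by '\\t' lines terminated by '\\n'",
        (data_file,))
    connection.commit()
    cursor.close()
    connection.close()
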
Example #6
#!/usr/bin/env python
#pull data from fayetteville, nc, pd's p2c site
#this is custom because that site is so different from others
import scrape_bulletin
import scrape_search_fay
from scraper_commands import check_data, print_files, fetch_commands, all_data
from db_load import db_load

from scraper_config import make_config

home_dir, data_dir, database, db_user, db_pw, commands_url = make_config()

data_dir = home_dir + 'data_fay'
database = 'crime'
user = {'user': '******', 'pw': 'redaolemirc'}

def main():
    #fetch data from our google spreadsheet that tells us what to scrape
    sites_to_scrape = [{
        'URL': 'http://p2c.bethebadge.com/p2c/Summary.aspx',
        'Agency': 'Fayetteville Police Department',
        'County': 'Cumberland',
        'How far back': '7'
    }]
    for site in sites_to_scrape:
        #variables we'll use in our scraping and data format
        county = site['County']
        url = site['URL']
        agency = site['Agency']
        #this is how many days back we want to scrape
        #e.g. 1 would scrape a total of 2 days:
        # today plus 1 day back (yesterday)
        howfar = int(site['How far back'])
        #try for daily bulletin
        #if not, then go for search
        bulletin_url = scrape_bulletin.try_bulletin(url)
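
The example above is cut off right after the try_bulletin call. try_bulletin lives in scrape_bulletin and isn't shown here; judging only from how it's used (it returns a bulletin URL, the string 'unreachable', or a falsy value that triggers the search scraper), a rough sketch could look like this. The dailybulletin.aspx path is an assumption, not taken from the real module.

import requests

def try_bulletin_sketch(summary_url):
    #hypothetical: guess a bulletin page on the same p2c site
    bulletin_url = summary_url.replace('Summary.aspx', 'dailybulletin.aspx')
    try:
        response = requests.get(bulletin_url, timeout=30)
    except requests.RequestException:
        #the site didn't answer at all
        return 'unreachable'
    if response.status_code == 200:
        return bulletin_url
    #no bulletin page; the caller falls back to the search scraper
    return None
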
#!/usr/bin/env python

#script to attach lat/lon coords to data using google's geocode api
#ungeocoded data has zeros for these fields.
#we use -1 for those where the geocoder failed for some reason so we don't try those again

import requests
import json
import re
import time
import MySQLdb

from scraper_config import make_config

#we only need db, user and pw to connect to the database
home_dir, data_dir, db, user, pw, commands_url = make_config('')

#these are values returned by the geocode api. any result that isn't among these types/location_types
#is considered a failure
acceptable_types = ['RANGE_INTERPOLATED', 'ROOFTOP', 'GEOMETRIC_CENTER']
acceptable_location_types = [
    'bus_station', 'transit_station', 'establishment', 'intersection',
    'street_number', 'parking'
]

url = 'https://maps.googleapis.com/maps/api/geocode/json'

#a value we add to the record to indicate the source of the lat/lon
geocoder = 'Google'

#we'll geocode data from tables in this list
data_tables = ['arrests', 'incidents']

#the database connection we'll use
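
The module above breaks off just before the database connection and the geocoding loop itself. As an illustration of how the url and the acceptable_types / acceptable_location_types lists defined above could be applied to a single lookup against Google's geocode API, here is a hedged sketch; the original's exact acceptance rule, address formatting, and API-key handling aren't shown in this section and are assumed.

def geocode_sketch(address):
    #returns (lat, lon) on success, or (-1, -1) so the record isn't retried,
    #following the -1 convention described in the comments above
    params = {'address': address}  #an API key may also be required
    response = requests.get(url, params=params)
    result = json.loads(response.text)
    if result.get('status') != 'OK' or not result.get('results'):
        return (-1, -1)
    best = result['results'][0]
    location_type = best['geometry']['location_type']
    result_types = best['types']
    if location_type in acceptable_types or any(
            t in acceptable_location_types for t in result_types):
        location = best['geometry']['location']
        return (location['lat'], location['lng'])
    return (-1, -1)
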