import json
import uuid

import pandas as pd
import requests

# NOTE: this import path is an assumption -- read_pandas_sql ships with older
# doltpy releases (doltpy.core.read); adjust for your doltpy/doltcli version.
from doltpy.core.read import read_pandas_sql


def get_municipality(dolt, state, fips, city):
    data = read_pandas_sql(
        dolt,
        "SELECT * FROM `municipalities` WHERE state_iso = '{}' AND county_fips = '{}' AND name = '{}'"
        .format(state, fips, city))
    # check how many rows came back
    if data.shape[0] == 0:
        print(" [X] No Matching Municipalities")
        return None
    if data.shape[0] == 1:
        print(" [!] Found Municipality: ID #{}!".format(data.loc[0, 'id']))
        return data.loc[0, 'id']
    # more than one row is ambiguous; treat it as no match
    print(" [!] {} Municipalities Matched; Skipping Ambiguous Result".format(data.shape[0]))
    return None
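# The queries in this script splice values into SQL with str.format, which
# breaks on names containing apostrophes (e.g. "Coeur d'Alene") and is open to
# injection. A minimal hedged sketch of an escaping helper -- `sql_quote` is
# hypothetical, not part of doltpy -- that could wrap each interpolated value:


def sql_quote(value):
    """Return value as a single-quoted SQL literal, doubling embedded quotes."""
    if value is None:
        return 'NULL'  # emit an unquoted NULL rather than the string 'None'
    return "'{}'".format(str(value).replace("'", "''"))


# usage sketch:
#   query = "SELECT * FROM `municipalities` WHERE name = {}".format(sql_quote(city))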
def get_dataset(dolt, dataset_url, agency):
    data = read_pandas_sql(
        dolt,
        "SELECT * FROM datasets WHERE url = '{}'".format(dataset_url))
    # check whether this dataset is already on record
    if data.shape[0] == 0:
        print(" [X] No Dataset Found! Proceeding to Add New Dataset...")
        return new_dataset(dolt, agency, dataset_url)
    if data.shape[0] == 1:
        print(" [!] Found Existing Dataset Record: ID #{}!".format(data.loc[0, 'id']))
        return data
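# The CLI-backed doltpy calls above cannot bind query parameters, so one
# hedged alternative is to query a running `dolt sql-server` through
# mysql.connector, which supports placeholders. The host, port, user, and
# database name below are assumptions about the local server setup.
import mysql.connector


def get_dataset_row(dataset_url):
    conn = mysql.connector.connect(host='127.0.0.1', port=3306,
                                   user='root', database='datasets')
    try:
        cur = conn.cursor(dictionary=True)
        # %s placeholders are escaped by the driver, so URLs with quotes are safe
        cur.execute("SELECT * FROM datasets WHERE url = %s", (dataset_url,))
        return cur.fetchone()
    finally:
        conn.close()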
def get_agency_id(dolt, name, state):
    try:
        # soundex(name) must reference the column; quoting it would compare
        # against the literal string 'name'
        data = read_pandas_sql(
            dolt,
            "SELECT * FROM `agencies` WHERE soundex(name) = soundex('{}') AND state_iso = '{}'"
            .format(name.strip(), state.strip()))
        # check how many rows came back
        if data.shape[0] == 0:
            print(" [X] No Agency Found!")
            return ''
        if data.shape[0] == 1:
            print(" [!] Found Agency ID #{}!".format(data.loc[0, 'id']))
            return data.loc[0, 'id']
        # more than one soundex match is ambiguous; treat it as no match
        print(" [!] {} Agencies Matched; Skipping Ambiguous Result".format(data.shape[0]))
        return ''
    except Exception as exc:
        print(" [X] Error Fetching Agency: {}".format(exc))
        return ''
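# SOUNDEX matching is loose and keyed to pronunciation, so small spelling
# differences ("Dept" vs "Department") can still cause misses. A hedged
# fallback sketch -- `get_agency_id_fuzzy` is a hypothetical wrapper, not part
# of the original script -- that retries with a prefix LIKE match when the
# soundex lookup comes back empty:


def get_agency_id_fuzzy(dolt, name, state):
    agency_id = get_agency_id(dolt, name, state)
    if agency_id:
        return agency_id
    # fall back to matching on the first word of the agency name
    prefix = name.strip().split(' ')[0]
    data = read_pandas_sql(
        dolt,
        "SELECT * FROM `agencies` WHERE name LIKE '{}%' AND state_iso = '{}'"
        .format(prefix, state.strip()))
    if data.shape[0] == 1:
        return data.loc[0, 'id']
    return ''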
def new_dataset(dolt, agency, url):
    print(' [*] Adding a New Dataset:')
    name = agency['name']
    print(' [*] name: {}'.format(name))
    print(' [*] url: {}'.format(url))
    # it appears most county-level agencies have 'County' in the name;
    # otherwise treat the dataset as municipal
    if 'County' in name:
        aggregation_level = 'county'
    else:
        aggregation_level = 'municipal'
    print(' [*] aggregation level: {}'.format(aggregation_level))
    source_type_id = 3  # Third Party
    print(' [*] source type: {}'.format('Third Party'))
    data_types_id = 10  # Incident Reports
    print(' [*] data type: {}'.format('Incident Reports'))
    format_types_id = 2  # CityProtect
    print(' [*] format type: {}'.format('CityProtect'))
    state = agency['state']
    print(' [*] state: {}'.format(state))
    # use lat and long to retrieve the county FIPS code from FCC.gov;
    # CityProtect stores coordinates in [lng, lat] order
    lat = agency['center']['coordinates'][1]
    lng = agency['center']['coordinates'][0]
    fcc = "https://geo.fcc.gov/api/census/area?lat={}&lon={}&format=json".format(lat, lng)
    print(" [!] Fetching County FIPS code from FCC.gov")
    response = requests.get(fcc)
    json_resp = json.loads(response.text)
    fips = json_resp['results'][0]['county_fips']
    print(" [*] fips: {}".format(fips))
    if aggregation_level == 'municipal':
        print(" [!] Searching municipalities table for id")
        city_id = get_municipality(dolt, state, fips, agency['city'])
    else:
        city_id = None
    consolidator = 'CityProtect'
    print(' [*] consolidator: {}'.format(consolidator))
    update_freq = 'quarterly'
    print(' [*] update freq: {}'.format(update_freq))
    portal = 'CityProtect'
    print(' [*] portal type: {}'.format(portal))
    start = agency['reports'][0]['targetPeriodStart'] or None
    print(' [*] start date: {}'.format(start))
    # technically we could just omit these, but leaving them here in case this
    # code is reused elsewhere so they aren't forgotten
    scraper_path = None
    notes = None
    # then grab all our vars and turn into a dataframe:
    data = pd.DataFrame([{
        'url': url,
        'name': name,
        'aggregation_level': aggregation_level,
        'source_type_id': source_type_id,
        'data_types_id': data_types_id,
        'format_types_id': format_types_id,
        'state_iso': state,
        'county_fips': fips,
        'city_id': city_id,
        'consolidator': consolidator,
        'update_frequency': update_freq,
        'portal_type': portal,
        'coverage_start': start,
        'scraper_path': scraper_path,
        'notes': notes,
    }])
    print(" [*] Inserting data to datasets table...")
    id = uuid.uuid4()
    # NOTE: None values are interpolated as the string 'None' by str.format;
    # swap in an unquoted NULL if the schema should store real NULLs
    insert = dolt.sql(
        "INSERT INTO datasets (`id`, `url`, `name`, `aggregation_level`, "
        "`source_type_id`, `data_types_id`, `format_types_id`, `state_iso`, "
        "`county_fips`, `city_id`, `consolidator`, `update_frequency`, "
        "`portal_type`, `coverage_start`, `scraper_path`, `notes`) VALUES "
        "('{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}', "
        "'{}', '{}', '{}', '{}', '{}');".format(
            id, url, name, aggregation_level, source_type_id, data_types_id,
            format_types_id, state, fips, city_id, consolidator, update_freq,
            portal, start, scraper_path, notes),
        result_format="csv")
    # and grab the record back out
    data = read_pandas_sql(dolt, "SELECT * FROM datasets WHERE id = '{}'".format(id))
    print(" [!] Inserted Dataset Record: ID #{}!".format(data.loc[0, 'id']))
    return data
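# The FCC area-API call above has no timeout or error handling, so a slow or
# failed lookup aborts the whole insert. A hedged wrapper sketch that degrades
# to None on failure, parsing only the fields the code above already reads:


def county_fips_from_coords(lat, lng):
    fcc = "https://geo.fcc.gov/api/census/area?lat={}&lon={}&format=json".format(lat, lng)
    try:
        response = requests.get(fcc, timeout=10)
        response.raise_for_status()
        results = response.json().get('results', [])
        return results[0]['county_fips'] if results else None
    except (requests.RequestException, KeyError, ValueError):
        return None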
# A revised new_dataset for the updated datasets schema, which links records
# through agency_id rather than state/county/city columns:
def new_dataset(dolt, agency, url):
    print(' [*] Adding a New Dataset:')
    name = agency['name'].replace("'", '')  # strip apostrophes so the raw SQL below doesn't break
    print(' [*] name: {}'.format(name))
    print(' [*] url: {}'.format(url))
    source_type_id = 3  # Third Party
    print(' [*] source type: {}'.format('Third Party'))
    data_types_id = 10  # Incident Reports
    print(' [*] data type: {}'.format('Incident Reports'))
    format_types_id = 2  # CityProtect
    print(' [*] format type: {}'.format('CityProtect'))
    # try to use soundex to find the agency ID; will not always work
    agency_id = get_agency_id(dolt, name, agency['state'])
    print(' [*] agency id: {}'.format(agency_id))
    # the FCC county-FIPS lookup from the earlier version of this function is
    # intentionally disabled here; this schema does not store county_fips
    update_freq = 3  # quarterly
    print(' [*] update freq: {}'.format(update_freq))
    portal = 'CityProtect'
    print(' [*] portal type: {}'.format(portal))
    start = agency['reports'][0]['targetPeriodStart'] or None
    print(' [*] start date: {}'.format(start))
    # technically we could just omit these, but leaving them here in case this
    # code is reused elsewhere so they aren't forgotten
    scraper_id = ''
    notes = ''
    # then grab all our vars and turn into a dataframe:
    data = pd.DataFrame([{
        'url': url,
        'name': name,
        'source_type_id': source_type_id,
        'data_types_id': data_types_id,
        'format_types_id': format_types_id,
        'agency_id': agency_id,
        'update_frequency': update_freq,
        'portal_type': portal,
        'coverage_start': start,
        'scraper_id': scraper_id,
        'notes': notes,
    }])
    print(" [*] Inserting data to datasets table...")
    id = str(uuid.uuid4()).replace('-', '')  # UUID without dashes
    # 12 columns, 12 placeholders, 12 format args; identifiers use backticks
    insert = dolt.sql(
        "INSERT INTO datasets (`id`, `url`, `name`, `source_type_id`, "
        "`data_types_id`, `format_types_id`, `agency_id`, `update_frequency`, "
        "`portal_type`, `coverage_start`, `scraper_id`, `notes`) VALUES "
        "('{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}');"
        .format(id, url, name, source_type_id, data_types_id, format_types_id,
                agency_id, update_freq, portal, start, scraper_id, notes),
        result_format="csv")
    # and grab the record back out
    data = read_pandas_sql(dolt, "SELECT * FROM datasets WHERE id = '{}'".format(id))
    print(" [!] Inserted Dataset Record: ID #{}!".format(data.loc[0, 'id']))
    return data
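# A hedged end-to-end usage sketch, guarded so it only runs when this script
# is executed directly. The repo path, agency payload, and URL are
# hypothetical placeholders shaped like the CityProtect records read above.
if __name__ == '__main__':
    from doltpy.core import Dolt  # import path varies across doltpy versions

    dolt = Dolt('path/to/data-sources-repo')  # assumed local Dolt checkout
    agency = {
        'name': 'Example Police Department',
        'state': 'PA',
        'city': 'Exampleville',
        'center': {'coordinates': [-75.16, 39.95]},  # [lng, lat] order, as read above
        'reports': [{'targetPeriodStart': '2020-01-01T00:00:00.000Z'}],
    }
    url = 'https://cityprotect.com/agency/00000000-0000-0000-0000-000000000000'
    get_dataset(dolt, url, agency)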