def split_geo(df, add, city, state, zipcode, chunk_size=500):
    df_new = df.copy()
    df_new.reset_index(inplace=True)
    splits = np.ceil(df.shape[0] / chunk_size)
    chunk_li = np.array_split(df_new['index'], splits)
    res_li = []
    pick_fi = []
    for i, c in enumerate(chunk_li):
        # Grab data, export to csv
        sub_data = df_new.loc[c, ['index', add, city, state, zipcode]]
        sub_data.to_csv('temp_geo.csv', header=False, index=False)
        # Geocode the results and turn back into df
        print(f'Geocoding round {int(i)+1} of {int(splits)}, {datetime.now()}')
        result = cg.addressbatch('temp_geo.csv')  # should try/except?
        # May want to dump the intermediate results
        # pi_str = f'pickres_{int(i)}.p'
        # pickle.dump(favorite_color, open(pi_str, "wb"))
        # pick_fi.append(pi_str)
        names = list(result[0].keys())
        res_zl = []
        for r in result:
            res_zl.append(list(r.values()))
        res_df = pd.DataFrame(res_zl, columns=names)
        res_li.append(res_df.copy())
        time.sleep(10)  # sleep 10 seconds to not get cut off from the request
    final_df = pd.concat(res_li)
    final_df.rename(columns={'id': 'row'}, inplace=True)
    final_df.reset_index(inplace=True, drop=True)
    # Clean up csv file
    os.remove('temp_geo.csv')
    return final_df
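# Hedged usage sketch for split_geo (not from the original source): the column
# names and sample address below are hypothetical, and the imports are the ones
# the function appears to rely on.
import os
import time
from datetime import datetime

import numpy as np
import pandas as pd
import censusgeocode as cg

addresses = pd.DataFrame({
    'addr':  ['1600 Pennsylvania Ave NW'],
    'city':  ['Washington'],
    'state': ['DC'],
    'zip':   ['20500'],
})

geocoded = split_geo(addresses, add='addr', city='city', state='state',
                     zipcode='zip', chunk_size=500)
print(geocoded.head())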
def callCensusBulkGeocoder(filename):
    print(datetime.datetime.now(), ' Geocoding addresses with Census Bulk')
    t5 = datetime.datetime.now()
    df = pd.read_csv(filename)
    # name, ext = os.path.splitext(filename)
    folder = r'\\fs-sea-1\Protection_Data_Files\Projects\19_022_NFIRS_EDA\IL\geocoded_results'
    parts = filename.split("\\")
    newpath = os.path.join(folder, parts[-1])
    name, ext = os.path.splitext(newpath)
    try:
        result = cg.addressbatch(filename)
        df = pd.DataFrame(result)
    except Exception:
        print('Something went wrong')
    else:
        pass
    # df.to_csv(name + '_bulk' + ext, header=True, index=False, float_format="%.6f")
    print(datetime.datetime.now(), ' Finished geocoding with Census Bulk Geocoder')
    t6 = datetime.datetime.now()
    return t5, t6, filename, name, ext, df
def get_census_data(self, debug=False):
    # Create list to store return data
    self.receiveData = []
    # Store list of people in firstname lastname format
    self.people_list = [i[0] + " " + i[1] for i in self.inputDataRaw]
    # Create and write to temporary sending csv file
    with open("tempSend.csv", "w", newline="\n") as file:
        csv_writer = csv.writer(file, dialect='excel')
        # Loop over the input data
        for index, row in enumerate(self.inputDataRaw):
            # Split address into street name, city, state, zip code
            temp = row[3].split(",")
            # Create list in proper format for the census API
            toWrite = [self.people_list[index], temp[0], temp[1], temp[2], temp[3]]
            print(toWrite)
            # Write row
            csv_writer.writerow(toWrite)
    # Send API request
    self.receiveData = cg.addressbatch("tempSend.csv")
    if debug:
        for house in self.receiveData:
            print(house)
    print("Done!\n")
def _geocode_batch(df):
    df[['street_address', 'city', 'postal_abbreviation', 'code']].dropna().to_csv(
        '/Users/travis.howe/Downloads/test_address2.csv', header=False, index=True)
    return pd.DataFrame(
        cg.addressbatch('/Users/travis.howe/Downloads/test_address2.csv',
                        returntype='geographies'))  # I have to send in a .csv file
def geocode_wrapper(df_input_chunk):
    """
    Runs the censusgeocode wrapper and merges the results to the df_input_chunk table
    """
    print("calling censusgeocode addressbatch")
    print(df_input_chunk.iloc[0, :])
    dict_output_chunk = cg.addressbatch(
        df_input_chunk[["street", "city", "state", "zip"]].to_dict('records'))
    print("Finished calling censusgeocode addressbatch")
    df_output_chunk = pd.DataFrame.from_dict(dict_output_chunk)
    df_output_chunk_merged = df_input_chunk.merge(df_output_chunk, on='address')
    return df_output_chunk_merged
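# Note (added, not from the original snippet): for the merge on 'address' to
# succeed, df_input_chunk is assumed to already carry an 'address' column that
# matches the full address string cg.addressbatch echoes back in its results;
# the street/city/state/zip columns alone are only used to build the request.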
def wrapper_api_call(df_input_addresses_queued):
    """
    API wrapper call (here, the censusgeocode package) that returns geocodes.
    For censusgeocode, the input is a list of dicts (each batch has to contain
    fewer than 1000 addresses) and the output is a list of OrderedDicts with
    the geocode results. Each dict has to have the keys street, city, state,
    and zip (the id is autogenerated by the censusgeocode package).
    """
    logging.info("Worker is now working.")
    geo_codes_ordered_dict = cg.addressbatch(
        df_input_addresses_queued.to_dict('records'))
    logging.info("Worker's work is complete.")
    return geo_codes_ordered_dict
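# Hedged sketch (not from the original source) of the input format the docstring
# describes: a small DataFrame with street/city/state/zip columns, converted to a
# list of dicts for cg.addressbatch. Imports and sample values are assumptions.
import logging

import pandas as pd
import censusgeocode as cg

logging.basicConfig(level=logging.INFO)

df_queued = pd.DataFrame([
    {'street': '1600 Pennsylvania Ave NW', 'city': 'Washington',
     'state': 'DC', 'zip': '20500'},
])

for rec in wrapper_api_call(df_queued):
    # Each result is a dict-like record; 'lat'/'lon' are only populated on a match.
    print(rec['address'], rec['match'], rec.get('lat'), rec.get('lon'))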
def batch_function(num):
    temp_filename = "dummy{num}.csv".format(num=num)
    try:
        time.sleep(num % 100)
        new_addresses.loc[num * 100:num * 100 + 99,
                          ("address", "city", "state", "zip")].to_csv(temp_filename)
        z = censusgeocode.addressbatch(temp_filename)
        os.remove(temp_filename)
        print(num)
        return [[i['address'], i['lat'], i['lon']] for i in z]
    except Exception:
        if os.path.exists(temp_filename):
            os.remove(temp_filename)
        print("error on {num}".format(num=num))
def geocode_batch(df):
    _batch_prep(df)
    # todo: is there anything I can do to increase the number of matches---about
    # 89% of observations have a match
    df['id'] = df.index
    vars = ['id', 'last_name', 'age', 'sex', 'postal_abbreviation']
    df_census = pd.DataFrame(
        cg.addressbatch('/Users/travis.howe/Downloads/test_address2.csv',
                        returntype='geographies'))
    df_census['id'] = df_census['id'].astype(int)
    joblib.dump(
        df[vars].merge(df_census, how='outer', on='id', indicator=True),
        'results.pkl')
def get_census_batch(self, addresses):
    '''
    This function sends the input addresses through the address parser and writes
    them out in the format the Census geocoder wants.
    '''
    address_column, city_column, state_column, zip_column = [], [], [], []
    for address in addresses:
        street_address, city, state, zip_code = self.parse_address(address)
        address_column.append(street_address)
        city_column.append(city)
        state_column.append(state)
        zip_column.append(zip_code)
    census_batch = pd.DataFrame(
        list(zip(address_column, city_column, state_column, zip_column)))
    census_batch.to_csv(self.batch_filename, header=None)
    results = cg.addressbatch(self.batch_filename)
    results_df = pd.DataFrame.from_dict(results)
    results_df.to_csv(self.census_results_filename)
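# For reference (added comment, not from the original class): the Census batch
# endpoint expects a headerless CSV whose columns are
#   unique id, street address, city, state, zip
# Writing census_batch with its default index and header=None, as above, lets the
# DataFrame index serve as that unique id, e.g. a row like
#   0,1600 Pennsylvania Ave NW,Washington,DC,20500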
def process_batch(batch_path):
    print(f"Begin processing {batch_path}...")
    geocoded_results = cg.addressbatch(batch_path, timeout=1000)
    print(f"Finish processing {batch_path}...")
    return geocoded_results
import censusgeocode as cg

data = cg.addressbatch('addresses.csv')
print(data)

for address in data:
    print(address['id'])
    print(address['lat'])
    print(address['lon'])
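# Note (added, not part of the original example): 'addresses.csv' is assumed to be
# a headerless CSV in the Census batch layout (unique id, street, city, state, zip),
# e.g.
#   1,1600 Pennsylvania Ave NW,Washington,DC,20500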
import censusgeocode as cg
import os
import pandas as pd

if not os.path.exists("./address_chunks/results"):
    os.makedirs("./address_chunks/results")

# path = os.path.dirname(__file__) + '/../../dataRAW/texasPractice/'
directory = "./address_chunks/"

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    # Skip the results subdirectory (and anything else that is not a file)
    if not os.path.isfile(os.path.join(directory, filename)):
        continue
    geocodeResults = pd.DataFrame(
        cg.addressbatch(directory + filename, delim=','))
    geocodeResults.to_csv(directory + 'results/R' + filename + '.csv')
    print(filename + " completed!")
def geocode_nfirs(year):
    """Geocode files within temp directory (created with create_nfirs_temp function).
    Writes output geocoded files to temp_output folder.

    Uses censusgeocode library to access the census geocoder api, described at
    https://geocoding.geo.census.gov/

    Args:
        year: int, year to geocode
    Returns:
        None
    """
    # Directory paths
    nfirs_interim = utils.DATA['interim'] / 'nfirs'
    temp_input = nfirs_interim / f'temp_{year}' / 'input'
    temp_output = nfirs_interim / f'temp_{year}' / 'output'

    # Start and current time
    start_time = datetime.now()
    cur_time = datetime.now()

    # Geocode the files
    for filename in os.listdir(temp_input):
        input_path = os.path.join(temp_input, filename)
        output_path = os.path.join(temp_output, f'{filename[:-4]}_output.csv')
        if os.path.exists(output_path):
            print(f'{filename} already geocoded.')
            continue

        # Try up to 10 attempts to geocode the file. Sometimes the connection to the
        # Census API will time out and cause the file to fail on that attempt.
        for attempt in range(10):
            try:
                # Sleep a random amount to keep from being banned from the geocoder api
                sleep(np.random.randint(1, 5))

                # Print time info
                cur_time = datetime.now()
                print('\nCurrent time:', cur_time.strftime('%H:%M:%S'))
                print(f'Geocoding: {filename}')

                # Geocode file
                results = pd.DataFrame(cg.addressbatch(input_path)).sort_values('id')
                results.to_csv(output_path, index=False)

                # Print time info
                step_time = (datetime.now() - cur_time).total_seconds() / 60
                total_time = (datetime.now() - start_time).total_seconds() / 60
                print(f'Step time elapsed: {step_time:.02f} minutes')
                print(f'Total time elapsed: {total_time:.02f} minutes')
                break
            except Exception:
                print(f'Failed on try {attempt}')
                continue

    print('\n\nFinished geocoding.\n')
    return