def test_stringio(self):
    """Geocode an in-memory file object holding the contents of ``small_path``."""
    # Text buffer on Python 3, bytes buffer otherwise.
    buffer_cls = io.StringIO if six.PY3 else io.BytesIO
    with open(self.small_path, 'r') as handle:
        sample = buffer_cls(handle.read())
    geocoded = censusbatchgeocoder.geocode(sample)
    row_total = len(geocoded)
    self.assertEqual(row_total, 5)
def test_wide(self):
    """Geocode ``wide_path`` with explicit column names and expect 10 results."""
    column_names = {
        "id": "Affidavit ID",
        "address": "Street",
        "city": "City",
        "state": "State",
        "zipcode": "Zip",
    }
    geocoded = censusbatchgeocoder.geocode(self.wide_path, **column_names)
    self.assertEqual(len(geocoded), 10)
def test_weird_headers(self):
    """Geocode ``weird_path`` by mapping its unusual headers; expect 5 results."""
    column_names = {
        "id": "foo",
        "address": "bar",
        "city": "baz",
        "state": "bada",
        "zipcode": "boom",
    }
    geocoded = censusbatchgeocoder.geocode(self.weird_path, **column_names)
    self.assertEqual(len(geocoded), 5)
def test_extra_columns(self):
    """Check the ``metadata_1``/``metadata_2`` values on the geocoded rows."""
    geocoded = censusbatchgeocoder.geocode(self.extra_path)
    # A tuple of pairs keeps the checks in a fixed order.
    expectations = (
        ('metadata_1', ['foo', 'bar', 'baz', 'bada', 'bing']),
        (
            'metadata_2',
            ['eenie', 'meenie', 'miney', 'moe', 'catch a tiger by the toe'],
        ),
    )
    for column, expected_values in expectations:
        actual_values = [row[column] for row in geocoded]
        self.assertEqual(actual_values, expected_values)
    self.assertEqual(len(geocoded), 5)
def test_bom(self):
    """Geocode ``bom_path`` read as ``utf-8-sig`` and expect 4 results."""
    options = {
        "id": "Affidavit ID",
        "address": "Street",
        "city": "City",
        "state": "State",
        "zipcode": "Zip",
        "encoding": "utf-8-sig",
    }
    geocoded = censusbatchgeocoder.geocode(self.bom_path, **options)
    self.assertEqual(len(geocoded), 4)
def run(self):
    """Geocode ``self.data`` and report the outcome through ``self.signals``.

    On success the geocoder output is emitted on ``signals.result``. On
    failure the traceback is printed and a tuple of
    ``(exception type, exception value, formatted traceback)`` is emitted on
    ``signals.error``. ``signals.finished`` is emitted in both cases.
    """
    try:
        results = censusbatchgeocoder.geocode(self.data)
    except Exception as exc:
        # Catch Exception instead of using a bare ``except`` so that
        # SystemExit and KeyboardInterrupt are not reported as geocoding
        # errors.
        traceback.print_exc()
        self.signals.error.emit((type(exc), exc, traceback.format_exc()))
    else:
        self.signals.result.emit(results)  # Return result
    finally:
        self.signals.finished.emit()  # Done
def test_list(self):
    """Geocode a list of two address dictionaries and expect 2 results."""
    first_address = {
        'address': '521 SWARTHMORE AVENUE',
        'city': 'PACIFIC PALISADES',
        'id': '1',
        'state': 'CA',
        'zipcode': '90272-4350',
    }
    second_address = {
        'address': '2015 W TEMPLE STREET',
        'city': 'LOS ANGELES',
        'id': '2',
        'state': 'CA',
        'zipcode': '90026-4913',
    }
    geocoded = censusbatchgeocoder.geocode([first_address, second_address])
    self.assertEqual(len(geocoded), 2)
def geocode_batch(start_idx, batch_size=batch_size):
    """Geocode one positional slice of ``df_raw`` and run the resulting updates.

    Args:
        start_idx: Position of the first row of the batch in ``df_raw``.
        batch_size: Number of rows to take starting at ``start_idx``.

    Returns:
        True when the batch was geocoded and the update query executed,
        False when any exception was raised (its traceback is printed to
        stdout).
    """
    try:
        start_time = time.time()
        end_idx = start_idx + batch_size
        records = df_raw.iloc[start_idx:end_idx].to_dict('records')
        result_dicts = censusbatchgeocoder.geocode(records, pooling=False)
        # One statement per geocoded row, joined and executed in one call.
        # NOTE(review): gen_update_q produces SQL text; confirm it escapes
        # the values it interpolates.
        update_query = ';'.join(gen_update_q(d) for d in result_dicts)
        curr.execute(update_query)
        print('thread finished for batch {} size: {} in {} seconds'.format(
            (start_idx, end_idx), batch_size, time.time() - start_time))
        return True
    except Exception:
        # Catch Exception instead of using a bare ``except`` so that
        # KeyboardInterrupt/SystemExit still propagate to the caller.
        traceback.print_exc(file=sys.stdout)
        return False
def geocode(f, seattle_acs, new_name):
    """Geocode the addresses in a CSV, attach income quintiles, and save it.

    Args:
        f: Input accepted by ``pd.read_csv``. The data must contain the
            ``RegStNum``, ``RegStName``, ``RegStType``, ``RegCity``,
            ``RegState`` and ``RegZipCode`` columns.
        seattle_acs: DataFrame with ``TRACT`` and ``income_quintile`` columns.
        new_name: Destination passed to ``DataFrame.to_csv``.
    """
    df = pd.read_csv(f)
    print("size of df here", df.shape)
    df['RegStNum'] = df['RegStNum'].apply(str)
    print("got here")
    df['address'] = (
        df['RegStNum'] + " " + df['RegStName'] + " " + df['RegStType']
    )
    df['city'] = df['RegCity'].copy()
    df['state'] = df['RegState'].copy()
    df['zipcode'] = df['RegZipCode'].copy()
    # The row index doubles as the id used to join results back onto df.
    df['id'] = df.index

    filtered_df = df[['address', "city", "state", "zipcode", "id"]]
    result = censusbatchgeocoder.geocode(filtered_df.to_dict("records"))
    filtered_df = pd.DataFrame(result)
    print("filtered df size before merge", filtered_df.shape)

    df = pd.merge(df, filtered_df[['id', 'tract']], how="left",
                  left_on="id", right_on="id")
    print("df size before dropping null tracts", df.shape)
    # Take an explicit copy so the column assignment below works on an
    # independent frame and does not trigger SettingWithCopyWarning.
    df = df[df['tract'].notnull()].copy()
    print("df size after dropping null tracts", df.shape)

    print("adding census tract income quintile rank for each address")
    # make both tract columns ints to avoid this error: "ValueError: You are
    # trying to merge on object and int64 columns. If you wish to proceed you
    # should use pd.concat"
    df["tract"] = df["tract"].astype(int)
    df = pd.merge(df, seattle_acs[['TRACT', 'income_quintile']],
                  left_on='tract', right_on="TRACT", how='left')
    df.to_csv(new_name)
    print("DONE")
def test_path(self):
    """Geocode directly from the ``small_path`` file path; expect 5 results."""
    geocoded = censusbatchgeocoder.geocode(self.small_path)
    row_total = len(geocoded)
    self.assertEqual(row_total, 5)
def test_coordinates(self):
    """Every geocoded row must carry ``latitude`` and ``longitude`` keys."""
    result = censusbatchgeocoder.geocode(self.small_path)
    for row in result:
        # assertIn reports the missing key and the row on failure, unlike
        # assertTrue(key in row), which only says "False is not true".
        self.assertIn('latitude', row)
        self.assertIn('longitude', row)
def test_batch_size(self):
    """Geocode ``small_path`` with ``batch_size=2`` and still expect 5 results."""
    geocoded = censusbatchgeocoder.geocode(self.small_path, batch_size=2)
    row_total = len(geocoded)
    self.assertEqual(row_total, 5)
def test_nopooling(self):
    """Geocode ``small_path`` with ``pooling=False`` and expect 5 results."""
    geocoded = censusbatchgeocoder.geocode(self.small_path, pooling=False)
    row_total = len(geocoded)
    self.assertEqual(row_total, 5)
def test_no_state_and_zipcode(self):
    """Geocode ``incomplete_path`` with no state or zipcode column; expect 5."""
    omitted_columns = {"state": None, "zipcode": None}
    geocoded = censusbatchgeocoder.geocode(
        self.incomplete_path,
        **omitted_columns
    )
    self.assertEqual(len(geocoded), 5)
def test_big_batch(self):
    """Geocode the ``big_path`` file and expect 1498 results."""
    geocoded = censusbatchgeocoder.geocode(self.big_path)
    row_total = len(geocoded)
    self.assertEqual(row_total, 1498)
print(datetime.datetime.now())
addressdata.head()


# In[ ]:


addressdata.shape


# In[ ]:


# Convert the frame to a list of per-row dictionaries for the geocoder.
fetchaddress = addressdata.to_dict("records")


# In[ ]:


print(datetime.datetime.now())
# fetchaddress is already a list of record dicts; calling .to_dict on it
# raised AttributeError, so pass the list through unchanged.
results = censusbatchgeocoder.geocode(fetchaddress, zipcode=None)
print(datetime.datetime.now())
# Timestamps noted from an earlier run (presumably start and end):
#2019-10-18 20:07:03.927510
#2019-10-18 20:10:55.237334


# In[ ]:


pd_df = pd.DataFrame(results)


# In[ ]:


pd_df.to_csv(
    'C:\\Data Analytics\\Sem 3\\ICT Solution\\Data Sets\\extractgeocodesdata.csv',
    index=False)
import logging

import pandas as pd
import censusbatchgeocoder

# Set the root logger to DEBUG level.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Read the spreadsheet, skipping its first three rows.
source_path = "~/Code/python-censusbatchgeocoder-example/privateschools1617.xls"
df = pd.read_excel(source_path, skiprows=3)

# Geocode only records 770 through 774, using the sheet's own column names.
sample_records = df.to_dict("records")[770:775]
column_names = {
    "id": "Affidavit ID",
    "address": "Street",
    "city": "City",
    "state": "State",
    "zipcode": "Zip",
}
result = censusbatchgeocoder.geocode(sample_records, **column_names)