class RGeocoder(object):
    """Reverse geocoder that resolves coordinates to the nearest known location.

    Location rows come either from a caller-supplied CSV stream or from the
    formatted CSV at ``rel_path(RG_FILE)``; when that file does not exist it
    is generated from the GeoNames downloads named by ``GN_CITIES_US``,
    ``GN_ADMIN1`` and ``GN_ADMIN2``.  Lookups go through a K-D tree built
    from the ``(lat, lon)`` pairs of those rows.
    """

    def __init__(self, mode=2, verbose=True, stream=None):
        """Load the location data and build the K-D tree.

        Args:
            mode (int): ``1`` builds a ``KDTree`` (single-process); any other
                value (default ``2``) builds a ``KDTree_MP.cKDTree_MP``
                (multi-process).
            verbose (bool): Print progress messages when True.
            stream (io.StringIO): Optional in-memory CSV data source.  When
                falsy, the data is read from ``rel_path(RG_FILE)`` instead.
        """
        self.mode = mode
        self.verbose = verbose
        if stream:
            coordinates, self.locations = self.load(stream)
        else:
            coordinates, self.locations = self.extract(rel_path(RG_FILE))

        if mode == 1:  # Single-process
            self.tree = KDTree(coordinates)
        else:  # Multi-process
            self.tree = KDTree_MP.cKDTree_MP(coordinates)

    def query(self, coordinates):
        """Return the location row nearest to each queried coordinate.

        Args:
            coordinates (list): List of coordinate tuples, i.e.
                ``[(latitude, longitude)]``.

        Returns:
            list: One entry of ``self.locations`` per queried coordinate.
        """
        if self.mode == 1:
            _, indices = self.tree.query(coordinates, k=1)
        else:
            _, indices = self.tree.pquery(coordinates, k=1)
        return [self.locations[index] for index in indices]

    def load(self, stream):
        """Load a custom data source.

        Args:
            stream (io.StringIO): Comma-separated data whose header must
                equal ``RG_COLUMNS``.

        Returns:
            tuple: ``(geo_coords, locations)`` where ``geo_coords`` is a list
            of ``(lat, lon)`` values as read from the stream and
            ``locations`` is the list of parsed rows.

        Raises:
            csv.Error: If the header differs from ``RG_COLUMNS``.
        """
        stream_reader = csv.DictReader(stream, delimiter=',')
        header = stream_reader.fieldnames

        if header != RG_COLUMNS:
            raise csv.Error(
                'Input must be a comma-separated file with header containing '
                'the following columns - %s. For more help, visit: '
                'https://github.com/thampiman/reverse-geocoder'
                % (','.join(RG_COLUMNS)))

        locations = list(stream_reader)
        geo_coords = [(row['lat'], row['lon']) for row in locations]
        return geo_coords, locations

    def extract(self, local_filename):
        """Load the formatted cities file, generating it first if missing.

        Args:
            local_filename (str): Path to the local RG_FILE.

        Returns:
            tuple: ``(geo_coords, locations)`` in the same shape as
            :meth:`load`.
        """
        if os.path.exists(local_filename):
            self._log('Loading formatted geocoded file...')
            # Materialise the rows so the file can be closed right away.
            with open(local_filename, 'rt') as cached:
                rows = list(csv.DictReader(cached))
        else:
            rows = self._build_formatted_file(local_filename)

        geo_coords = [(row['lat'], row['lon']) for row in rows]
        return geo_coords, list(rows)

    def _log(self, message):
        """Print ``message`` when verbose output is enabled."""
        if self.verbose:
            print(message)

    @staticmethod
    def _download(sources):
        """Fetch every ``(url, destination)`` pair in ``sources``."""
        # Only the import differs between Python versions, so only the
        # import is guarded.
        try:  # Python 3
            from urllib.request import urlretrieve
        except ImportError:  # Python 2
            from urllib import urlretrieve
        for url, destination in sources:
            urlretrieve(url, destination)

    @staticmethod
    def _read_admin_map(filename):
        """Map the ``concatCodes`` column to ``asciiName`` for a tab-separated file."""
        code_col = ADMIN_COLUMNS['concatCodes']
        name_col = ADMIN_COLUMNS['asciiName']
        with open(filename, 'rt') as handle:
            return {row[code_col]: row[name_col]
                    for row in csv.reader(handle, delimiter='\t')}

    def _build_formatted_file(self, local_filename):
        """Create ``local_filename`` from the GeoNames sources and return its rows."""
        cities_zipfilename = GN_CITIES_US + '.zip'
        cities_filename = GN_CITIES_US + '.txt'

        if not os.path.exists(cities_zipfilename):
            self._log('Downloading files from Geoname...')
            self._download([
                (GN_URL + cities_zipfilename, cities_zipfilename),
                (GN_URL + GN_ADMIN1, GN_ADMIN1),
                (GN_URL + GN_ADMIN2, GN_ADMIN2),
            ])

        self._log('Extracting cities_US...')
        with zipfile.ZipFile(cities_zipfilename) as archive:
            payload = archive.read(cities_filename)
        with open(cities_filename, 'wb') as extracted:
            extracted.write(payload)

        self._log('Loading admin1 codes...')
        admin1_map = self._read_admin_map(GN_ADMIN1)

        self._log('Loading admin2 codes...')
        admin2_map = self._read_admin_map(GN_ADMIN2)

        self._log('Creating formatted geocoded file...')
        rows = []
        with open(cities_filename, 'rt') as cities:
            reader = csv.reader(cities, delimiter='\t', quoting=csv.QUOTE_NONE)
            for row in reader:
                cc = row[GN_COLUMNS['countryCode']]
                admin1_c = row[GN_COLUMNS['admin1Code']]
                admin2_c = row[GN_COLUMNS['admin2Code']]
                cc_admin1 = cc + '.' + admin1_c
                cc_admin2 = cc_admin1 + '.' + admin2_c
                rows.append({
                    'lat': row[GN_COLUMNS['latitude']],
                    'lon': row[GN_COLUMNS['longitude']],
                    'name': row[GN_COLUMNS['asciiName']],
                    # Unknown admin codes resolve to an empty name.
                    'admin1': admin1_map.get(cc_admin1, ''),
                    'admin2': admin2_map.get(cc_admin2, ''),
                    'cc': cc,
                })

        # The cache file is opened only once every row has been built, so a
        # failure while parsing cannot leave an empty cache file behind.
        with open(local_filename, 'wt') as formatted:
            writer = csv.DictWriter(formatted, fieldnames=RG_COLUMNS)
            writer.writeheader()
            writer.writerows(rows)

        self._log('Removing extracted cities_US to save space...')
        os.remove(cities_filename)
        return rows
class RGeocoder:
    """Reverse geocoder backed by a K-D tree of converted city coordinates.

    Location rows are read from the formatted CSV at ``rel_path(RG_FILE)``,
    which is generated from the GeoNames downloads named by
    ``GN_CITIES1000``, ``GN_ADMIN1`` and ``GN_ADMIN2`` when it does not
    exist.  The tree is built from the output of ``geodetic_in_ecef``
    applied to the rows' ``(lat, lon)`` pairs.
    """

    def __init__(self, mode=2):
        """Load the location data and build the K-D tree.

        Args:
            mode (int): ``1`` builds a ``KDTree`` (single-process); any other
                value (default ``2``) builds a ``KDTree_MP.cKDTree_MP``
                (multi-process).
        """
        coordinates, self.locations = self.extract(rel_path(RG_FILE))
        self.mode = mode
        if mode == 1:  # Single-process
            self.tree = KDTree(coordinates)
        else:  # Multi-process
            self.tree = KDTree_MP.cKDTree_MP(coordinates)

    def query(self, coordinates):
        """Return the location row nearest to each queried coordinate.

        Args:
            coordinates: Query points, forwarded unchanged to the tree.
                NOTE(review): the tree is built from ``geodetic_in_ecef``
                output, so callers presumably must pass points that were
                converted the same way - confirm against the call sites.

        Returns:
            list: One entry of ``self.locations`` per queried point.

        Raises:
            ValueError: Propagated unchanged from the underlying tree query.
        """
        if self.mode == 1:
            _, indices = self.tree.query(coordinates, k=1)
        else:
            _, indices = self.tree.pquery(coordinates, k=1)
        return [self.locations[index] for index in indices]

    def extract(self, local_filename):
        """Load the formatted cities file, generating it first if missing.

        Args:
            local_filename (str): Path to the local RG_FILE.

        Returns:
            tuple: ``(ecef_coords, locations)`` where ``ecef_coords`` is
            whatever ``geodetic_in_ecef`` returns for the rows' ``(lat, lon)``
            pairs and ``locations`` is the list of rows.
        """
        if os.path.exists(local_filename):
            print('Loading formatted geocoded file...')
            # Materialise the rows so the file can be closed right away.
            with open(local_filename, 'rt') as cached:
                rows = list(csv.DictReader(cached))
        else:
            rows = self._build_formatted_file(local_filename)

        geo_coords = [(row['lat'], row['lon']) for row in rows]
        ecef_coords = geodetic_in_ecef(geo_coords)
        return ecef_coords, list(rows)

    @staticmethod
    def _download(sources):
        """Fetch every ``(url, destination)`` pair in ``sources``."""
        # Only the import differs between Python versions, so only the
        # import is guarded.
        try:  # Python 3
            from urllib.request import urlretrieve
        except ImportError:  # Python 2
            from urllib import urlretrieve
        for url, destination in sources:
            urlretrieve(url, destination)

    @staticmethod
    def _read_admin_map(filename):
        """Map the ``concatCodes`` column to ``asciiName`` for a tab-separated file."""
        code_col = ADMIN_COLUMNS['concatCodes']
        name_col = ADMIN_COLUMNS['asciiName']
        with open(filename, 'rt') as handle:
            return {row[code_col]: row[name_col]
                    for row in csv.reader(handle, delimiter='\t')}

    def _build_formatted_file(self, local_filename):
        """Create ``local_filename`` from the GeoNames sources and return its rows."""
        cities1000_zipfilename = GN_CITIES1000 + '.zip'
        cities1000_filename = GN_CITIES1000 + '.txt'

        if not os.path.exists(cities1000_zipfilename):
            print('Downloading files from Geoname...')
            self._download([
                (GN_URL + cities1000_zipfilename, cities1000_zipfilename),
                (GN_URL + GN_ADMIN1, GN_ADMIN1),
                (GN_URL + GN_ADMIN2, GN_ADMIN2),
            ])

        print('Extracting cities1000...')
        with zipfile.ZipFile(cities1000_zipfilename) as archive:
            payload = archive.read(cities1000_filename)
        with open(cities1000_filename, 'wb') as extracted:
            extracted.write(payload)

        print('Loading admin1 codes...')
        admin1_map = self._read_admin_map(GN_ADMIN1)

        print('Loading admin2 codes...')
        admin2_map = self._read_admin_map(GN_ADMIN2)

        print('Creating formatted geocoded file...')
        rows = []
        with open(cities1000_filename, 'rt') as cities:
            reader = csv.reader(cities, delimiter='\t', quoting=csv.QUOTE_NONE)
            for row in reader:
                cc = row[GN_COLUMNS['countryCode']]
                admin1_c = row[GN_COLUMNS['admin1Code']]
                admin2_c = row[GN_COLUMNS['admin2Code']]
                cc_admin1 = cc + '.' + admin1_c
                cc_admin2 = cc_admin1 + '.' + admin2_c
                rows.append({
                    'lat': row[GN_COLUMNS['latitude']],
                    'lon': row[GN_COLUMNS['longitude']],
                    'name': row[GN_COLUMNS['asciiName']],
                    # Unknown admin codes resolve to an empty name.
                    'admin1': admin1_map.get(cc_admin1, ''),
                    'admin2': admin2_map.get(cc_admin2, ''),
                    'cc': cc,
                })

        # The cache file is opened only once every row has been built, so a
        # failure while parsing cannot leave an empty cache file behind.
        with open(local_filename, 'wt') as formatted:
            writer = csv.DictWriter(formatted, fieldnames=RG_COLUMNS)
            writer.writeheader()
            writer.writerows(rows)

        print('Removing extracted cities1000 to save space...')
        os.remove(cities1000_filename)
        return rows
class OSM_RG:
    """Reverse geocoder over a cities file enriched with reverse-geocoded addresses.

    The data file is chosen by ``precision_mode``.  When it does not exist
    and its path contains ``"rg_cities"``, it is generated from a GeoNames
    download, with each row's address looked up through the object returned
    by ``osm_geolocator()``.  Lookups go through a K-D tree built from the
    rows' ``(lat, lon)`` values.
    """

    def __init__(self, mode=1, precision_mode=2):
        """Pick the data file, load it and build the K-D tree.

        Args:
            mode (int): ``1`` (default) builds a ``KDTree`` (single-process);
                any other value builds a ``KDTree_MP.cKDTree_MP``
                (multi-process).
            precision_mode (int): ``0`` selects ``RG_FILE_1000``, ``1``
                selects ``RG_FILE_5000`` and any other value (default ``2``)
                selects ``RG_FILE_15000``.
        """
        self.mode = mode
        if precision_mode == 0:
            loc_path = RG_FILE_1000
        elif precision_mode == 1:
            loc_path = RG_FILE_5000
        else:
            loc_path = RG_FILE_15000
        # Use the path as given when it exists, otherwise resolve it
        # through rel_path.
        if not os.path.exists(loc_path):
            loc_path = rel_path(loc_path)
        coordinates, self.locations = self.extract(loc_path)
        if mode == 1:  # Single-process
            self.tree = KDTree(coordinates)
        else:  # Multi-process
            self.tree = KDTree_MP.cKDTree_MP(coordinates)

    def query(self, coordinates):
        """Find the closest known location for each coordinate.

        Args:
            coordinates: Query points, forwarded unchanged to the tree.

        Returns:
            list: One entry of ``self.locations`` per queried point.

        Raises:
            ValueError: Propagated unchanged from the underlying tree query.
        """
        if self.mode == 1:
            _, indices = self.tree.query(coordinates, k=1)
        else:
            _, indices = self.tree.pquery(coordinates, k=1)
        return [self.locations[index] for index in indices]

    def extract(self, local_filename):
        """Load the geocode data, generating the file first when possible.

        Args:
            local_filename (str): Path to the formatted CSV.

        Returns:
            tuple: ``(geo_coords, locations)`` where ``geo_coords`` is a list
            of ``(float(lat), float(lon))`` tuples and ``locations`` is a
            list of dicts restricted to ``COLUMNS_OF_INTEREST``.

        Raises:
            Exception: If the file is missing and its path does not contain
                ``"rg_cities"``.
        """
        if os.path.exists(local_filename):
            df = pd.read_csv(local_filename)
        elif "rg_cities" in local_filename:
            df = self._build_from_geonames(local_filename)
        else:
            raise Exception("Geocoded file not found", local_filename)

        df.dropna(subset=COLUMNS_OF_INTEREST, inplace=True)
        df.drop_duplicates(subset=["city"], inplace=True)

        # Load all the coordinates and locations
        geo_coords, locations = [], []
        for _, row in df.iterrows():
            geo_coords.append((float(row["lat"]), float(row["lon"])))
            locations.append(
                {k: v for k, v in row.items() if k in COLUMNS_OF_INTEREST})
        return geo_coords, locations

    @staticmethod
    def _build_from_geonames(local_filename):
        """Download a GeoNames cities dump, add addresses and write the CSV.

        Returns the in-memory DataFrame that was written to
        ``local_filename``.
        """
        # The GeoNames file name is the local base name without its
        # extension and without its first three characters (presumably the
        # "rg_" prefix - confirm against the RG_FILE_* constants).
        # os.path keeps this correct on platforms whose separator is not "/".
        stem = os.path.splitext(os.path.basename(local_filename))[0]
        url_filename = stem[3:]
        cities_zipfilename = url_filename + ".zip"
        cities_filename = url_filename + ".txt"

        if not os.path.exists(cities_zipfilename):
            import urllib.request
            urllib.request.urlretrieve(GN_URL + cities_zipfilename,
                                       cities_zipfilename)

        with zipfile.ZipFile(cities_zipfilename) as archive:
            payload = archive.read(cities_filename)
        with open(cities_filename, "wb") as extracted:
            extracted.write(payload)

        df = pd.read_csv(cities_filename, delimiter="\t",
                         names=sorted(GN_COLUMNS, key=GN_COLUMNS.get))
        df.drop([x for x in GN_COLUMNS if x not in GN_COLUMNS_OF_INTEREST],
                axis=1, inplace=True)

        geolocator = osm_geolocator()
        address_list = []
        with click.progressbar(length=len(df),
                               label="reversing geodata") as bar:
            for _, row in df.iterrows():
                # One-second pause before every lookup.
                time.sleep(1)
                bar.update(1)
                lat = row["latitude"]
                lon = row["longitude"]
                address = None
                # NOTE(review): this retries forever, sleeping five seconds
                # after each failure; a point that can never be resolved
                # will hang the build.
                while address is None:
                    try:
                        address = geolocator.reverse(
                            (lat, lon), timeout=10,
                            language="en").raw["address"]
                    except Exception:
                        time.sleep(5)
                address_list.append(address)

        df["address"] = address_list
        df.rename(columns={"latitude": "lat", "longitude": "lon",
                           "geoNameId": "geo_id"}, inplace=True)
        df.to_csv(local_filename, index=False)
        os.remove(cities_filename)
        os.remove(cities_zipfilename)
        # NOTE(review): parse_address receives only the file path, so the
        # DataFrame returned here does not reflect anything it writes -
        # confirm that is intended.
        parse_address(local_filename)
        return df
class RGeocoder:
    """Reverse geocoder backed by a K-D tree of city coordinates.

    Location rows are read from the formatted CSV at ``rel_path(RG_FILE)``,
    which is generated from the GeoNames downloads named by
    ``GN_CITIES1000``, ``GN_ADMIN1`` and ``GN_ADMIN2`` when it does not
    exist.  The tree is built from the rows' ``(lat, lon)`` pairs.
    """

    def __init__(self, mode=2):
        """Load the location data and build the K-D tree.

        Args:
            mode (int): ``1`` builds a ``KDTree`` (single-process); any other
                value (default ``2``) builds a ``KDTree_MP.cKDTree_MP``
                (multi-process).
        """
        coordinates, self.locations = self.extract(rel_path(RG_FILE))
        self.mode = mode
        if mode == 1:  # Single-process
            self.tree = KDTree(coordinates)
        else:  # Multi-process
            self.tree = KDTree_MP.cKDTree_MP(coordinates)

    def query(self, coordinates):
        """Return the location row nearest to each queried coordinate.

        Args:
            coordinates: Query points, forwarded unchanged to the tree.

        Returns:
            list: One entry of ``self.locations`` per queried point.

        Raises:
            ValueError: Propagated unchanged from the underlying tree query.
        """
        if self.mode == 1:
            _, indices = self.tree.query(coordinates, k=1)
        else:
            _, indices = self.tree.pquery(coordinates, k=1)
        return [self.locations[index] for index in indices]

    def extract(self, local_filename):
        """Load the formatted cities file, generating it first if missing.

        Args:
            local_filename (str): Path to the local RG_FILE.

        Returns:
            tuple: ``(geo_coords, locations)`` where ``geo_coords`` is a list
            of ``(lat, lon)`` values as stored in the rows and ``locations``
            is the list of rows.
        """
        if os.path.exists(local_filename):
            print('Loading formatted geocoded file...')
            # Materialise the rows so the file can be closed right away.
            with open(local_filename, 'rt') as cached:
                rows = list(csv.DictReader(cached))
        else:
            rows = self._build_formatted_file(local_filename)

        # NOTE(review): an ECEF conversion of these pairs used to be computed
        # here but its result was never used; the tree is built on the raw
        # (lat, lon) pairs.
        geo_coords = [(row['lat'], row['lon']) for row in rows]
        return geo_coords, list(rows)

    @staticmethod
    def _download(sources):
        """Fetch every ``(url, destination)`` pair in ``sources``."""
        # Only the import differs between Python versions, so only the
        # import is guarded.
        try:  # Python 3
            from urllib.request import urlretrieve
        except ImportError:  # Python 2
            from urllib import urlretrieve
        for url, destination in sources:
            urlretrieve(url, destination)

    @staticmethod
    def _read_admin_map(filename):
        """Map the ``concatCodes`` column to ``asciiName`` for a tab-separated file."""
        code_col = ADMIN_COLUMNS['concatCodes']
        name_col = ADMIN_COLUMNS['asciiName']
        with open(filename, 'rt') as handle:
            return {row[code_col]: row[name_col]
                    for row in csv.reader(handle, delimiter='\t')}

    def _build_formatted_file(self, local_filename):
        """Create ``local_filename`` from the GeoNames sources and return its rows."""
        cities1000_zipfilename = GN_CITIES1000 + '.zip'
        cities1000_filename = GN_CITIES1000 + '.txt'

        if not os.path.exists(cities1000_zipfilename):
            print('Downloading files from Geoname...')
            self._download([
                (GN_URL + cities1000_zipfilename, cities1000_zipfilename),
                (GN_URL + GN_ADMIN1, GN_ADMIN1),
                (GN_URL + GN_ADMIN2, GN_ADMIN2),
            ])

        print('Extracting cities1000...')
        with zipfile.ZipFile(cities1000_zipfilename) as archive:
            payload = archive.read(cities1000_filename)
        with open(cities1000_filename, 'wb') as extracted:
            extracted.write(payload)

        print('Loading admin1 codes...')
        admin1_map = self._read_admin_map(GN_ADMIN1)

        print('Loading admin2 codes...')
        admin2_map = self._read_admin_map(GN_ADMIN2)

        print('Creating formatted geocoded file...')
        rows = []
        with open(cities1000_filename, 'rt') as cities:
            reader = csv.reader(cities, delimiter='\t', quoting=csv.QUOTE_NONE)
            for row in reader:
                cc = row[GN_COLUMNS['countryCode']]
                admin1_c = row[GN_COLUMNS['admin1Code']]
                admin2_c = row[GN_COLUMNS['admin2Code']]
                cc_admin1 = cc + '.' + admin1_c
                cc_admin2 = cc_admin1 + '.' + admin2_c
                rows.append({
                    'lat': row[GN_COLUMNS['latitude']],
                    'lon': row[GN_COLUMNS['longitude']],
                    'name': row[GN_COLUMNS['asciiName']],
                    # Unknown admin codes resolve to an empty name.
                    'admin1': admin1_map.get(cc_admin1, ''),
                    'admin2': admin2_map.get(cc_admin2, ''),
                    'cc': cc,
                })

        # The cache file is opened only once every row has been built, so a
        # failure while parsing cannot leave an empty cache file behind.
        with open(local_filename, 'wt') as formatted:
            writer = csv.DictWriter(formatted, fieldnames=RG_COLUMNS)
            writer.writeheader()
            writer.writerows(rows)

        print('Removing extracted cities1000 to save space...')
        os.remove(cities1000_filename)
        return rows
class RGeocoderImpl:
    """The main reverse geocoder class.

    Location rows come either from a caller-supplied CSV stream or from the
    formatted CSV at ``rel_path(RG_FILE)``, which is generated from GeoNames
    downloads when it does not exist.  Lookups go through a K-D tree built
    from the rows' ``(lat, lon)`` pairs.
    """

    def __init__(self, mode=2, verbose=True, stream=None, stream_columns=None):
        """Load the location data and build the K-D tree.

        Args:
            mode (int): ``1`` builds a ``KDTree`` (single-process); any other
                value (default ``2``) builds a ``KDTree_MP.cKDTree_MP``
                (multi-process).
            verbose (bool): Print progress messages when True.
            stream (io.StringIO): Optional in-memory CSV data source.  When
                falsy, the data is read from ``rel_path(RG_FILE)`` instead.
            stream_columns (list): Optional header the stream must match.
        """
        self.mode = mode
        self.verbose = verbose
        if stream:
            coordinates, self.locations = self.load(stream, stream_columns)
        else:
            coordinates, self.locations = self.extract(rel_path(RG_FILE))

        if mode == 1:  # Single-process
            self.tree = KDTree(coordinates)
        else:  # Multi-process
            self.tree = KDTree_MP.cKDTree_MP(coordinates)

    @classmethod
    def from_data(cls, data: str, **kwargs):
        """Create an instance from CSV text.

        Args:
            data: Comma-separated text, wrapped in an ``io.StringIO``.
            **kwargs: Extra constructor arguments (``mode``, ``verbose``,
                ``stream_columns``).
        """
        return cls(stream=io.StringIO(data), **kwargs)

    @classmethod
    def from_files(cls, location_files: list, **kwargs):
        """Create an instance from a list of location files.

        Args:
            location_files: Files handed to
                ``RGeocoderDataLoader.load_files_stream``, whose result is
                used as the data stream.
            **kwargs: Extra constructor arguments (``mode``, ``verbose``,
                ``stream_columns``).

        Returns:
            RGeocoderImpl: The new instance.
        """
        data_stream = RGeocoderDataLoader.load_files_stream(location_files)
        return cls(stream=data_stream, **kwargs)

    def query(self, coordinates):
        """Return the location row nearest to each queried coordinate.

        Args:
            coordinates (list): List of coordinate tuples, i.e.
                ``[(latitude, longitude)]``.

        Returns:
            list: One entry of ``self.locations`` per queried coordinate.
        """
        if self.mode == 1:
            _, indices = self.tree.query(coordinates, k=1)
        else:
            _, indices = self.tree.pquery(coordinates, k=1)
        return [self.locations[index] for index in indices]

    def query_dist(self, coordinates):
        """Return the nearest location and its distance for each coordinate.

        Args:
            coordinates (list): List of coordinate tuples, i.e.
                ``[(latitude, longitude)]``.

        Returns:
            list: ``(distance, location)`` tuples, one per queried
            coordinate, with the distance as reported by the tree.
        """
        if self.mode == 1:
            dists, indices = self.tree.query(coordinates, k=1)
        else:
            dists, indices = self.tree.pquery(coordinates, k=1)
        # A backend may report each distance as a one-element array or as a
        # bare scalar; normalise both to a single value per point.
        return [(self._first_value(dist), self.locations[index])
                for dist, index in zip(dists, indices)]

    @staticmethod
    def _first_value(value):
        """Return ``value[0]`` for indexable values, ``value`` itself otherwise."""
        try:
            return value[0]
        except (IndexError, TypeError):
            return value

    def _log(self, message):
        """Print ``message`` when verbose output is enabled."""
        if self.verbose:
            print(message)

    def load(self, stream, stream_columns):
        """Load a custom data source.

        Args:
            stream (io.StringIO): Comma-separated data with a header row.
            stream_columns (list): Expected header; validation is skipped
                when this is falsy.

        Returns:
            tuple: ``(geo_coords, locations)`` where ``geo_coords`` is a list
            of ``(lat, lon)`` values as read from the stream and
            ``locations`` is the list of parsed rows.

        Raises:
            csv.Error: If ``stream_columns`` is given and the header differs
                from it (including when the stream has no header at all).
        """
        self._log('Loading geocoded stream ...')
        stream_reader = csv.DictReader(stream, delimiter=',')
        header = stream_reader.fieldnames

        if stream_columns and header != stream_columns:
            # ``header`` is None for an empty stream; report it as an empty
            # header rather than failing inside ``join``.
            raise csv.Error(
                'Input must be a comma-separated file with header containing '
                'the following columns - %s.\nFound header - %s.\n'
                'For more help, visit: '
                'https://github.com/thampiman/reverse-geocoder'
                % (','.join(stream_columns), ','.join(header or [])))

        locations = list(stream_reader)
        geo_coords = [(row['lat'], row['lon']) for row in locations]
        return geo_coords, locations

    def extract(self, local_filename):
        """Load the formatted cities file, generating it first if missing.

        Args:
            local_filename (str): Path to the local RG_FILE.

        Returns:
            tuple: ``(geo_coords, locations)`` in the same shape as
            :meth:`load`.
        """
        if os.path.exists(local_filename):
            self._log('Loading formatted geocoded file ...')
            # Materialise the rows so the file can be closed right away.
            with open(local_filename, 'rt') as cached:
                rows = list(csv.DictReader(cached))
        else:
            rows = self.do_extract(GN_CITIES1000, local_filename)

        geo_coords = [(row['lat'], row['lon']) for row in rows]
        return geo_coords, list(rows)

    @staticmethod
    def _read_admin_map(filename):
        """Map the ``concatCodes`` column to ``asciiName`` for a tab-separated file."""
        code_col = ADMIN_COLUMNS['concatCodes']
        name_col = ADMIN_COLUMNS['asciiName']
        with open(filename, 'rt') as handle:
            return {row[code_col]: row[name_col]
                    for row in csv.reader(handle, delimiter='\t')}

    def do_extract(self, geoname_file, local_filename):
        """Build the formatted CSV from a GeoNames cities dump.

        Args:
            geoname_file (str): GeoNames dump name without extension; the
                ``.zip`` is downloaded from ``GN_URL`` when not present
                locally.
            local_filename (str): Path of the formatted CSV to write.

        Returns:
            list: The row dicts written to ``local_filename``.
        """
        cities_zipfilename = geoname_file + '.zip'
        cities_filename = geoname_file + '.txt'

        if not os.path.exists(cities_zipfilename):
            self._log('Downloading files from Geoname...')
            import urllib.request
            urllib.request.urlretrieve(GN_URL + cities_zipfilename,
                                       cities_zipfilename)
            urllib.request.urlretrieve(GN_URL + GN_ADMIN1, GN_ADMIN1)
            urllib.request.urlretrieve(GN_URL + GN_ADMIN2, GN_ADMIN2)

        self._log('Extracting %s...' % geoname_file)
        with zipfile.ZipFile(cities_zipfilename) as archive:
            payload = archive.read(cities_filename)
        with open(cities_filename, 'wb') as extracted:
            extracted.write(payload)

        self._log('Loading admin1 codes...')
        admin1_map = self._read_admin_map(GN_ADMIN1)

        self._log('Loading admin2 codes...')
        admin2_map = self._read_admin_map(GN_ADMIN2)

        self._log('Creating formatted geocoded file...')
        rows = []
        with open(cities_filename, 'rt') as cities:
            reader = csv.reader(cities, delimiter='\t', quoting=csv.QUOTE_NONE)
            for row in reader:
                cc = row[GN_COLUMNS['countryCode']]
                admin1_c = row[GN_COLUMNS['admin1Code']]
                admin2_c = row[GN_COLUMNS['admin2Code']]
                cc_admin1 = cc + '.' + admin1_c
                cc_admin2 = cc_admin1 + '.' + admin2_c
                rows.append({
                    'lat': row[GN_COLUMNS['latitude']],
                    'lon': row[GN_COLUMNS['longitude']],
                    'name': row[GN_COLUMNS['asciiName']],
                    # Unknown admin codes resolve to an empty name.
                    'admin1': admin1_map.get(cc_admin1, ''),
                    'admin2': admin2_map.get(cc_admin2, ''),
                    'cc': cc,
                })

        # The cache file is opened only once every row has been built, so a
        # failure while parsing cannot leave an empty cache file behind.
        # newline='' lets the csv module control line endings itself.
        with open(local_filename, 'wt', newline='') as formatted:
            writer = csv.DictWriter(formatted, fieldnames=RG_COLUMNS)
            writer.writeheader()
            writer.writerows(rows)

        self._log('Removing extracted %s to save space...' % geoname_file)
        os.remove(cities_filename)
        return rows