def build_block_maps(pkg):
    """Map census blocks to CBSAs and UTM grid cells, then write both maps to CSV."""
    cache = get_cache(pkg)
    states = list(stusab.values())

    grid_key = 'blocks/map/source/utm'
    cache.put(grid_key, pkg.reference('utm_grid').geoframe())

    cbsa_key = 'blocks/map/source/cbsa'
    cache.put(cbsa_key, pkg.reference('cbsa').geoframe().to_crs(4326))

    tasks = [(cache, st, grid_key, cbsa_key) for st in states]

    try:
        # On macOS, disable App Nap so the worker processes are not throttled
        import appnope
        with appnope.nope_scope():
            r = run_mp(_f_block_maps, tasks)
    except ImportError:
        r = run_mp(_f_block_maps, tasks)

    cbsa_map = pd.concat([cache.get(e[0]) for e in r])
    utm_map = pd.concat([cache.get(e[1]) for e in r])

    pkg_root = Path(pkg.path).parent
    cbsa_map.to_csv(pkg_root.joinpath('data', 'cbsa_map.csv'), index=False)
    utm_map.to_csv(pkg_root.joinpath('data', 'utm_map.csv'), index=False)
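
# The worker passed to run_mp() above is not defined in this module. From how
# the results are used (cache.get(e[0]) feeds cbsa_map, cache.get(e[1]) feeds
# utm_map), it must cache two per-state mapping frames and return their keys.
# The sketch below only illustrates that contract; the block source key, the
# join logic, and the output key layout are assumptions, not the project's
# actual implementation.
def _f_block_maps_sketch(task):
    """Hypothetical worker: map one state's blocks to CBSAs and UTM grid cells."""
    import geopandas as gpd  # may already be imported at module level

    cache, st, grid_key, cbsa_key = task

    blocks = cache.get(f'blocks/source/{st}')  # assumed key layout
    cbsa = cache.get(cbsa_key)
    grid = cache.get(grid_key).to_crs(blocks.crs)  # align CRS before joining (assumption)

    # Spatial joins; the join predicate is illustrative only
    cbsa_map = gpd.sjoin(blocks, cbsa.to_crs(blocks.crs), how='inner', predicate='intersects')
    utm_map = gpd.sjoin(blocks, grid, how='inner', predicate='intersects')

    cbsa_map_key = f'blocks/map/cbsa/{st}'  # assumed key layout
    utm_map_key = f'blocks/map/utm/{st}'
    cache.put(cbsa_map_key, cbsa_map)
    cache.put(utm_map_key, utm_map)

    return cbsa_map_key, utm_map_key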

def build_clusters(pkg):
    """Cluster business points within each CBSA and write the results to CSV."""
    cache = get_cache(pkg)

    cluster_logger.info('Caching source points by CBSA')
    cache_points_cbsa(pkg, cache)
    cache_lines_cbsa(pkg, cache)

    cluster_logger.info('Start MP run')
    tasks = [(cache, e.stem) for e in cache.list('clusters/source_points/cbsa')]
    r = run_mp(run_cbsa_clusters, tasks)

    cluster_logger.info('Assemble metro points')
    metro_point_keys = [k1 for k1, k2 in r if not isinstance(k1, Exception)]
    frames = [cache.get(k) for k in tqdm(metro_point_keys)]
    metro_points = pd.concat(frames)

    cluster_logger.info('Assemble clusters')
    cluster_keys = [k2 for k1, k2 in r if not isinstance(k1, Exception)]
    frames = [cache.get(k) for k in tqdm(cluster_keys) if cache.exists(k)]
    clusters = pd.concat(frames)

    cluster_logger.info('Write files')
    pkg_root = Path(pkg.path).parent
    metro_points.to_csv(pkg_root.joinpath('data', 'metro_points.csv'), index=False)
    clusters.to_csv(pkg_root.joinpath('data', 'business_clusters.csv'), index=False)
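
# run_cbsa_clusters() is defined elsewhere. From the calling code it takes a
# (cache, cbsa_id) task, writes a metro-points frame and (optionally) a
# clusters frame to the cache, and returns their two keys. The sketch below is
# only a guess at that shape; DBSCAN, its parameters, and the key layout are
# placeholder assumptions, not the project's actual clustering method.
def run_cbsa_clusters_sketch(task):
    """Hypothetical worker: cluster the business points of one CBSA."""
    import pandas as pd
    from sklearn.cluster import DBSCAN  # assumed clustering backend

    cache, cbsa_id = task

    # Assumed to be a point GeoDataFrame cached by cache_points_cbsa()
    points = cache.get(f'clusters/source_points/cbsa/{cbsa_id}')

    # Cluster on x/y coordinates; eps and min_samples are placeholders
    xy = pd.DataFrame({'x': points.geometry.x, 'y': points.geometry.y})
    points['cluster'] = DBSCAN(eps=250, min_samples=10).fit_predict(xy)

    points_key = f'clusters/metro_points/{cbsa_id}'  # assumed key layout
    cache.put(points_key, points)

    clusters_key = f'clusters/clusters/{cbsa_id}'
    labeled = points[points.cluster >= 0]
    if len(labeled):
        # One row per cluster; only written when something clustered,
        # which is why the caller checks cache.exists() on this key
        summary = labeled.groupby('cluster').agg(n=('cluster', 'size')).reset_index()
        cache.put(clusters_key, summary)

    return points_key, clusters_key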

def split_blocks(pkg):
    """Download block files and cache them"""
    cache = get_cache(pkg)
    states = list(stusab.values())

    keys = run_mp(_f_get_split_blocks,
                  [(st, cache, pkg.reference('block_templ').url.format(st=st)) for st in states],
                  n_cpu=4)  # 4 CPUs because we're downloading

    return keys
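
# _f_get_split_blocks() is the download worker used above; it is not defined
# in this module. A plausible shape, given the (state, cache, url) task and
# that split_blocks() simply returns what the workers return, is sketched
# below. Reading the block file straight from the URL and the key layout are
# assumptions; the real worker may also split the file before caching.
def _f_get_split_blocks_sketch(task):
    """Hypothetical worker: download one state's block file and cache it."""
    import geopandas as gpd

    st, cache, url = task

    # TIGER block archives can usually be read directly from the URL;
    # otherwise download to a temp file first
    blocks = gpd.read_file(url)

    key = f'blocks/source/{st}'  # assumed key layout
    cache.put(key, blocks)

    return key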

def get_tags_df(self, nrows=None, force=False):
    """Extract all of the OSM points records that have useful other_tags,
    add geohashes, and write the file to the cache"""

    tqdm.pandas()  # Add progress_apply()

    key = self.osm_key + '/tags'

    if self.cache.exists(key) and not force:
        return self.cache.get_df(key)

    p = self.cache.joinpath(self.csv_key, 'points.csv')

    logger.debug('Loading points file')
    df = pd.read_csv(p, low_memory=False, nrows=nrows)

    logger.debug('Generate tasks')
    tasks = [(e, self.extract_tags) for e in np.array_split(df, 200)]

    results = run_mp(OsmProcessor.do_extract_tags, tasks, 'Split OSM other_tags')

    self.tags = list(chain(*[e[0] for e in results]))
    self.errors = list(chain(*[e[1] for e in results]))

    logger.debug('Create tags df')
    tags_df = pd.DataFrame(self.tags, columns=['osm_id'] + self.extract_tags)

    # About half of the entries, 2.7M, are trees and rocks
    logger.debug('Remove trees and rocks')
    tags_df = tags_df[~tags_df.natural.isin(['tree', 'rock'])]

    logger.debug('Merge geometry')
    tags_df = pd.merge(tags_df, df[['osm_id', 'geometry']], on='osm_id')

    def encode(v):
        # Parse "POINT (lon lat)" WKT and geohash-encode as (lat, lon)
        return gh.encode(*list(map(float, v[7:-1].split()))[::-1])

    logger.debug('Add geohash')
    tags_df['geohash'] = tags_df.geometry.progress_apply(encode)

    logger.debug('Convert to geopandas')
    tags_df['geometry'] = tags_df.geometry.progress_apply(shapely.wkt.loads)
    tags_df = gpd.GeoDataFrame(tags_df, geometry='geometry', crs=4326)

    logger.debug('Write to file')
    self.cache.put_df(key, tags_df)

    return tags_df
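
# OsmProcessor.do_extract_tags() is the multiprocessing worker used in
# get_tags_df(); it is not shown here. Its contract is clear from the caller:
# it takes a (dataframe-chunk, tag-name-list) task and returns (tags, errors),
# where each tags row is [osm_id] + one value per requested tag. The parsing
# below assumes the OGR-style other_tags encoding ("key"=>"value",...); treat
# it as a sketch of the contract, not the project's real code.
def do_extract_tags_sketch(task):
    """Hypothetical worker: pull selected other_tags keys out of one chunk."""
    df, tag_names = task

    tags, errors = [], []

    for row in df.itertuples():
        try:
            other = row.other_tags if isinstance(row.other_tags, str) else ''
            # '"key"=>"value","key"=>"value"'  ->  {key: value}
            d = dict(
                pair.split('"=>"')
                for pair in other.strip('"').split('","')
                if '"=>"' in pair
            )
            tags.append([row.osm_id] + [d.get(t) for t in tag_names])
        except Exception as e:
            errors.append((row.osm_id, e))

    return tags, errors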

def join_blocks(pkg, break_starts):
    """Join census blocks and OSM points"""
    cache = get_cache(pkg)
    states = list(stusab.values())

    tasks = list(e + (cache,) for e in product(break_starts, states))

    keys = run_mp(_f_join_blocks, tasks)

    joins = [e for e in keys if not isinstance(e, Exception)]
    exn = [e for e in keys if isinstance(e, Exception)]

    return joins
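
# _f_join_blocks() is defined elsewhere. From join_blocks() each task is a
# (break_start, state, cache) tuple and the worker either returns a cache key
# or an Exception, which the caller filters out. The sjoin below, and the
# reading of break_start as selecting a slice of the OSM points, are
# assumptions made only to illustrate the contract.
def _f_join_blocks_sketch(task):
    """Hypothetical worker: join one state's blocks to one slice of OSM points."""
    import geopandas as gpd

    break_start, st, cache = task

    blocks = cache.get(f'blocks/source/{st}')           # assumed key layout
    points = cache.get(f'points/breaks/{break_start}')  # assumed key layout

    joined = gpd.sjoin(points, blocks.to_crs(points.crs),
                       how='inner', predicate='within')

    key = f'joins/{st}/{break_start}'  # assumed key layout
    cache.put(key, joined)

    return key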

def simplify_lines(pkg, recombine_keys):
    """Simplify the recombined line geometries, caching the resulting keys."""
    cache = get_cache(pkg)

    try:
        # Return the cached keys if this is already done
        return cache.get('simplified/simplified_keys')
    except KeyError:
        pass

    simplified_keys = run_mp(f_simplify_lines,
                             [(cache.root, e) for e in recombine_keys],
                             desc='Simplify')
    simplified_keys = list(chain(*simplified_keys))

    cache.put('simplified/simplified_keys', simplified_keys)

    return simplified_keys
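
# f_simplify_lines() is not shown. Each task is (cache_root, recombine_key)
# and the results are flattened with chain(), so the worker must return a list
# of keys. The sketch below opens a cache from the root path (an assumed
# constructor), simplifies the line geometries, and returns a one-element
# list; the tolerance and the output key layout are placeholders.
def f_simplify_lines_sketch(task):
    """Hypothetical worker: simplify the line geometries under one recombine key."""
    cache_root, key = task

    cache = open_cache(cache_root)  # assumed: a cache can be opened from its root path

    lines = cache.get(key)
    lines['geometry'] = lines.geometry.simplify(0.0001)  # tolerance in CRS units

    out_key = f'simplified/{key}'  # assumed key layout
    cache.put(out_key, lines)

    return [out_key]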

def make_tags_df(pkg):
    """Create the tags dataframe"""
    cache = open_cache(pkg)

    points_logger.info('Make tags dataframe')

    tqdm.pandas()  # enable progress_apply()

    try:
        tags_df = cache.get_df('points/tags_df')
    except KeyError:
        points_df = pkg.reference('points').read_csv(low_memory=False)

        # Split the file and extract tags in multiprocessing
        N_task = 200
        tasks = [(e, extract_tag_names) for e in np.array_split(points_df, N_task)]

        results = run_mp(extract_tags, tasks, 'Split OSM other_tags')

        tags = list(chain(*[e[0] for e in results]))
        errors = list(chain(*[e[1] for e in results]))

        tags_df = pd.DataFrame(tags, columns=['osm_id'] + extract_tag_names)

        # About half of the entries, 2.7M, are trees and rocks
        tags_df = tags_df[~tags_df.natural.isin(['tree', 'rock'])]

        tags_df = pd.merge(tags_df, points_df[['osm_id', 'geometry']], on='osm_id')

        def encode(v):
            # Parse "POINT (lon lat)" WKT and geohash-encode as (lat, lon)
            return gh.encode(*list(map(float, v[7:-1].split()))[::-1])

        tags_df['geohash'] = tags_df.geometry.progress_apply(encode)

        tags_df['geometry'] = tags_df.geometry.progress_apply(shapely.wkt.loads)
        tags_df = gpd.GeoDataFrame(tags_df, geometry='geometry', crs=4326)

        cache.put_df('points/tags_df', tags_df)

    return tags_df

def run_overlay(pkg, splits, force=False):
    """Overlay the geohash splits, returning the recombine keys."""
    cache = get_cache(pkg)

    if not force:
        try:
            # Return the cached keys if this is already done
            recombine_keys = cache.get('recombine/recombine_keys')
            if len(recombine_keys) == len(splits):
                return recombine_keys
        except KeyError:
            pass

    tasks = [[cache.root, e, ro_key(e)] for e in splits]
    recombine_keys = run_mp(f_run_overlay, tasks, desc='Overlay Geohash')

    cache.put('recombine/recombine_keys', recombine_keys)

    return list(filter(bool, recombine_keys))
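
# Neither ro_key() nor f_run_overlay() appears in this module. The task shape
# [cache.root, split_key, ro_key(split_key)] and the final filter(bool, ...)
# suggest the worker writes its overlay result under a precomputed output key
# and returns that key, or a falsy value when there is nothing to write. The
# key mapping and the layer being overlaid below are illustrative assumptions.
def ro_key_sketch(split_key):
    """Hypothetical mapping from a split key to its recombine output key."""
    return f"recombine/{split_key.split('/')[-1]}"


def f_run_overlay_sketch(task):
    """Hypothetical worker: overlay one geohash split and cache the result."""
    import geopandas as gpd

    cache_root, split_key, out_key = task

    cache = open_cache(cache_root)  # assumed: a cache can be opened from its root path

    split = cache.get(split_key)
    other = cache.get('overlay/source')  # assumed: the polygon layer being overlaid

    result = gpd.overlay(split, other.to_crs(split.crs), how='intersection')
    if result.empty:
        return None

    cache.put(out_key, result)
    return out_key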