from functools import reduce
from itertools import chain
from pathlib import Path

import geopandas as gpd
import pandas as pd
import shapely.wkt
from tqdm import tqdm

import metapack as mp

# FileCache, LPError, hw_type, aggregates, col_f, munge, get_columns,
# group_resources, logger and lines_logger are defined elsewhere in this
# project.


def f_run_overlay(cache_dir, key, okey):
    """Overlay one cached tile of road lines with the UTM grid."""
    cache = FileCache(cache_dir)

    if cache.exists(okey):
        return okey

    t = cache.get_df(key)
    utm = cache.get_df('utm_grid')

    # Keep only the highway types we care about, recoded to short names
    t = t[t.highway.isin(list(hw_type.keys()))]
    t['highway'] = t.highway.replace(hw_type)  # Cuts file size by 100M

    t['geometry'] = t.geometry.apply(shapely.wkt.loads)

    if len(t) == 0:
        return None

    gdf = gpd.GeoDataFrame(t, crs=4326)

    try:
        t = gpd.overlay(gdf, utm)
        try:
            cache.put_df(okey, t)
        except Exception:
            # Remove a partial write of the output key before re-raising
            if cache.exists(okey):
                cache.delete(okey)
            raise
    except IndexError as e:
        raise LPError(f"Failed for {key} gdf:{len(gdf)} hashes:{len(utm)}: {e}")

    return okey
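# Sketch of a driver for f_run_overlay over many cached tiles. The
# 'lines/<name>' -> 'overlay/<name>' key convention and the process pool
# are illustrative assumptions, not part of this module.
def run_overlays(cache_dir, keys):
    from concurrent.futures import ProcessPoolExecutor

    # One (cache_dir, key, okey) argument triple per tile
    args = [(cache_dir, k, k.replace('lines/', 'overlay/')) for k in keys]

    with ProcessPoolExecutor() as pool:
        results = list(pool.map(f_run_overlay, *zip(*args)))

    # f_run_overlay returns None for tiles with no matching highways
    return [ok for ok in results if ok is not None]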
def write_files(pkg, simplified_keys):
    """Split the simplified lines into residential and non-residential road files."""
    pkg_root = Path(pkg.path).parent
    cache = FileCache(pkg_root.joinpath('data', 'cache'))

    f1 = pkg_root.joinpath('data', 'residential_roads.csv')
    f2 = pkg_root.joinpath('data', 'nonres_roads.csv')

    if f1.exists() and f2.exists():
        lines_logger.info('Both roads files exist, not writing')
        return

    t = pd.concat([cache.get_df(e) for e in simplified_keys])
    t = t[['zone', 'epsg', 'us_state', 'cus_state', 'highway', 'geometry']]

    residential_roads = t[t.highway == 'r']
    nonres_roads = t[t.highway != 'r']

    if not f1.exists():
        residential_roads.to_csv(f1, index=False)
    if not f2.exists():
        nonres_roads.to_csv(f2, index=False)
def f_simplify_lines(cache_dir, key):
    """Simplify the geometries of one overlay tile, grouped by UTM zone."""
    cache = FileCache(cache_dir)

    if not key:
        return []

    try:
        df = cache.get_df(key)
    except (EOFError, AttributeError) as e:
        raise LPError(f"Failed to load key {key}: {e}")

    okeys = []

    for idx, g in df.groupby('epsg'):
        _, fn = key.split('/')
        okey = f'simplified/{idx}/{fn}'

        if not cache.exists(okey):
            # Project into the zone's UTM CRS, simplify with a 20 m
            # tolerance (without preserving topology), project back to
            # WGS84 and serialize to WKT
            geometry = g.to_crs(epsg=idx).geometry \
                .simplify(20, False) \
                .to_crs(4326) \
                .apply(lambda e: shapely.wkt.dumps(e, rounding_precision=0))

            g = pd.DataFrame(g).assign(geometry=geometry)
            cache.put_df(okey, g)

        okeys.append(okey)

    return okeys
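# Sketch of how the overlay, simplify and write stages might chain
# together; the key scheme and the flattening step are assumptions based
# on the functions' signatures, not a confirmed pipeline.
def process_lines(pkg, cache_dir, line_keys):
    overlay_keys = [f_run_overlay(cache_dir, k, k.replace('lines/', 'overlay/'))
                    for k in line_keys]

    # f_simplify_lines returns one key per EPSG zone, so flatten the
    # lists, skipping tiles where the overlay was empty
    simplified_keys = list(chain(*[f_simplify_lines(cache_dir, k)
                                   for k in overlay_keys if k]))

    write_files(pkg, simplified_keys)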
def __init__(self, pkg):
    self.pkg = pkg
    self.pkg_root = Path(self.pkg.path).parent
    self.cache = FileCache(self.pkg_root.joinpath('data', 'cache'))

    # self.cp = CensusProcessor(self.cache, 2019, progress_bar=True)
    # self.tp = TractProcessor(self.cache)

    self._continental_states = None
    self._tracts = None
    self._gtracts = None
    self._geo_to_tract = None
    self._ghdf = None
    self._overlay = None
    self._zones = None

    self.ea_epsg = 2163  # US Equal Area projection
def load_group_frames(pkg, group_name):
    """Load the dataframe for each resource in a group, caching each one."""
    frames = {}
    pkg_root = Path(pkg.path).parent
    cache = FileCache(pkg_root.joinpath('cache'))

    for r in tqdm(group_resources(pkg, group_name)):
        if r.name not in frames:
            ck = f'frames/{r.name}'
            if not cache.exists(ck):
                df = r.dataframe()
                frames[r.name] = df
                cache.put_df(ck, pd.DataFrame(df))
            else:
                frames[r.name] = cache.get_df(ck)

    return frames
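# Sketch: combine every frame in a resource group into one dataframe.
# The default group name 'tracts' is a placeholder; real group names come
# from the package metadata.
def concat_group(pkg, group_name='tracts'):
    frames = load_group_frames(pkg, group_name)
    return pd.concat(frames.values(), ignore_index=True)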
class ExtractManager(object):
    """Build the combined census dataset and aggregation map for the package."""

    def __init__(self, pkg, cache=None):
        self.pkg = pkg
        self.pkg_root = Path(self.pkg.path).parent

        self._df = None
        self._agg_map = None

        if cache is None:
            self._cache = FileCache(self.pkg_root.joinpath('data', 'cache'))
        else:
            self._cache = cache

    @property
    def table_code_map(self):
        """Map from census table codes to friendlier names"""
        return {c.props.get('tablecode'): c.name
                for c in self.pkg.resource('census_set').schema_term.find('Table.Column')
                if c.props.get('tablecode')}

    @property
    def agg_map(self):
        if self._agg_map is None:
            _ = self.census_set  # Also creates the agg_map
        return self._agg_map

    def update_schema(self):
        """Backfill missing column descriptions in the package schema."""
        # Re-open in case it has changed since loaded in this notebook
        pkg = mp.open_package(self.pkg.ref)

        for c in pkg.resource('combined').schema_term.find('Table.Column'):
            if not c.description:
                c.description = self.column_map.get(c.name.upper())

        pkg.write()

    @property
    def column_map(self):
        # Gets created in base_census_df
        return self._cache.get('base_census_df_cm')

    @property
    def base_census_df(self):
        k = 'base_census_df'
        kcm = 'base_census_df_cm'

        if not self._cache.exists(k) or not self._cache.exists(kcm):
            logger.info('Collect frames')
            frames = [r.dataframe().drop(columns=['stusab', 'county', 'name'])
                      for r in tqdm(self.pkg.references())
                      if r.name.startswith('B')]

            # Need to do this here b/c we need the CensusDataFrame objects
            kv = list(filter(col_f,
                             chain(*[list(e.title_map.items()) for e in frames])))

            column_map = {k: munge(v) for k, v in kv}

            logger.info('Assemble frames into dataset')
            df = reduce(lambda left, right: left.join(right), frames[1:], frames[0])

            self._cache.put_df(k, df)
            self._cache.put(kcm, column_map)
            return df
        else:
            return self._cache.get(k)

    @property
    def census_set(self):
        if self._df is None:
            df = self.base_census_df

            # Get rid of the margin columns
            m90_col = [c for c in df.columns if c.endswith('m90')]
            df = df.drop(columns=m90_col)

            logger.info('Make aggregate map')
            rows = []
            for acol, scols in aggregates.items():
                df[acol] = df.loc[:, scols].sum(axis=1)
                for c in scols:
                    rows.append((acol, c, self.column_map[c.upper()]))

            self._agg_map = pd.DataFrame(
                rows, columns=['agg_column', 'source_col', 'description'])

            df = df.reset_index()

            iq = self.pkg.reference('income_quartiles').dataframe()
            df = df.merge(iq.set_index('geoid'), on='geoid').fillna(0)

            agg = self.pkg.reference('aggregate_income').dataframe().drop(
                columns=['households'])
            df = df.merge(agg.set_index('geoid'), on='geoid').fillna(0)

            # Rename non-aggregated columns to nicer names
            df = df.rename(columns=self.table_code_map)

            # Select only the columns described in the schema
            cols = get_columns(self.pkg)

            self._df = df.replace({'': 0}).fillna(0)[cols]

        return self._df

    outputs = ('census_set', 'agg_map')

    def build(self, force=False, clean=False):
        dd = self.pkg_root.joinpath('data')

        if clean:
            self._cache.clean()

        if not dd.exists():
            dd.mkdir(parents=True, exist_ok=True)

        for o in self.outputs:
            p = dd.joinpath(o).with_suffix('.csv')
            if not p.exists() or force:
                logger.info(f"Creating {o}{' (forcing)' if force else ''}")
                d = getattr(self, o)
                logger.info(f"Write {o}")
                d.to_csv(p, index=False)
            else:
                logger.info(f"{o} already exists")
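# Typical driver for ExtractManager, e.g. from a notebook; the package
# reference is a placeholder, and running update_schema() after build()
# is an assumption about the intended workflow.
def build_extracts(pkg_ref, force=False):
    pkg = mp.open_package(pkg_ref)

    em = ExtractManager(pkg)
    em.build(force=force)  # Writes data/census_set.csv and data/agg_map.csv
    em.update_schema()     # Backfill column descriptions in the schema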
def get_cache(pkg):
    return FileCache(Path(pkg.path).parent.joinpath('data', 'cache'))