from functools import reduce
from itertools import chain
from pathlib import Path

import geopandas as gpd
import metapack as mp
import pandas as pd
import shapely.wkt
from tqdm import tqdm

# Project-local names used below (FileCache, hw_type, LPError, group_resources,
# aggregates, col_f, munge, get_columns and the loggers) are assumed to be
# importable from the surrounding package.


def f_run_overlay(cache_dir, key, okey):
    cache = FileCache(cache_dir)

    # Already computed on a previous run; skip the overlay.
    if cache.exists(okey):
        return okey

    t = cache.get_df(key)
    utm = cache.get_df('utm_grid')

    # Keep only the highway types in hw_type, then map them to short codes.
    # Cuts file size by 100M.
    t = t[t.highway.isin(list(hw_type.keys()))]
    t['highway'] = t.highway.replace(hw_type)
    t['geometry'] = t.geometry.apply(shapely.wkt.loads)

    if len(t) == 0:
        return None

    gdf = gpd.GeoDataFrame(t, crs=4326)
    try:
        t = gpd.overlay(gdf, utm)

        try:
            cache.put_df(okey, t)
        except Exception:
            # Remove a partially written output before re-raising, so a later
            # run doesn't mistake a corrupt cache entry for a finished one.
            if cache.exists(okey):
                cache.delete(okey)
            raise
    except IndexError as e:
        raise LPError(
            f"Failed for {key} gdf:{len(gdf)} hashes:{len(utm)}: {e}")

    return okey
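

# A hypothetical driver (a sketch, not a fixture of the original source):
# since f_run_overlay returns early when the output key already exists, it
# can be fanned out safely over many cache keys. The flat key names and the
# 'overlay/...' output scheme below are assumptions.
def run_all_overlays(cache_dir, keys):
    from multiprocessing import Pool

    with Pool() as pool:
        okeys = pool.starmap(
            f_run_overlay, [(cache_dir, k, f'overlay/{k}') for k in keys])

    # f_run_overlay returns None for tiles with no matching roads.
    return [k for k in okeys if k is not None]
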
def write_files(pkg, simplified_keys):
    pkg_root = Path(pkg.path).parent
    cache = FileCache(pkg_root.joinpath('data', 'cache'))

    f1 = pkg_root.joinpath('data', 'residential_roads.csv')
    f2 = pkg_root.joinpath('data', 'nonres_roads.csv')

    if f1.exists() and f2.exists():
        lines_logger.info('Both roads files exist, not writing')
        return

    t = pd.concat([cache.get_df(e) for e in simplified_keys])
    t = t[['zone', 'epsg', 'us_state', 'cus_state', 'highway', 'geometry']]
    residential_roads = t[t.highway == 'r']
    nonres_roads = t[t.highway != 'r']

    if not f1.exists():
        residential_roads.to_csv(f1, index=False)

    if not f2.exists():
        nonres_roads.to_csv(f2, index=False)
def f_simplify_lines(cache_dir, key):
    cache = FileCache(cache_dir)

    if not key:
        return []

    try:
        df = cache.get_df(key)
    except (EOFError, AttributeError) as e:
        raise LPError(f"Failed to load key {key}: {e}")

    okeys = []

    for idx, g in df.groupby('epsg'):
        _, fn = key.split('/')
        okey = f'simplified/{idx}/{fn}'

        if not cache.exists(okey):
            # Project into the zone's UTM CRS so the tolerance of 20 is in
            # meters, simplify, then return to lat/lon and serialize as WKT.
            geometry = g.to_crs(epsg=idx).geometry \
                .simplify(20, preserve_topology=False) \
                .to_crs(4326) \
                .apply(lambda e: shapely.wkt.dumps(e, rounding_precision=0))
            g = pd.DataFrame(g).assign(geometry=geometry)

            cache.put_df(okey, g)

        okeys.append(okey)

    return okeys
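

# A hypothetical end-to-end sketch tying the three steps together: overlay
# each source tile against the UTM grid, simplify per UTM zone, then write
# the roads files. The flat `source_keys` (no '/' in the key, since
# f_simplify_lines splits on it exactly once) are an assumption.
def build_roads(pkg, cache_dir, source_keys):
    simplified_keys = []

    for k in source_keys:
        okey = f_run_overlay(cache_dir, k, f'overlay/{k}')
        # f_simplify_lines returns [] when okey is None (an empty tile).
        simplified_keys.extend(f_simplify_lines(cache_dir, okey))

    write_files(pkg, simplified_keys)
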
    def __init__(self, pkg):
        self.pkg = pkg

        self.pkg_root = Path(self.pkg.path).parent

        self.cache = FileCache(self.pkg_root.joinpath('data', 'cache'))

        # self.cp = CensusProcessor(self.cache, 2019, progress_bar=True)
        # self.tp = TractProcessor(self.cache)

        self._continental_states = None
        self._tracts = None
        self._gtracts = None
        self._geo_to_tract = None
        self._ghdf = None
        self._overlay = None
        self._zones = None

        self.ea_epsg = 2163  # US Equal Area projection
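
    # EPSG:2163 (the US National Atlas Equal Area projection) gives metric,
    # area-preserving coordinates; a typical use (a sketch, assuming a
    # lat/lon GeoDataFrame `gdf`) would be:
    #
    #     areas_m2 = gdf.to_crs(epsg=self.ea_epsg).area
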
def load_group_frames(pkg, group_name):
    frames = {}

    pkg_root = Path(pkg.path).parent
    cache = FileCache(pkg_root.joinpath('cache'))

    for r in tqdm(group_resources(pkg, group_name)):
        if r.name not in frames:
            ck = f'frames/{r.name}'
            if cache.exists(ck):
                frames[r.name] = cache.get_df(ck)
            else:
                df = r.dataframe()
                frames[r.name] = df
                cache.put_df(ck, pd.DataFrame(df))

    return frames
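

# Example use (a sketch; the 'tracts' group name is an assumption):
#
#     frames = load_group_frames(pkg, 'tracts')
#     combined = pd.concat(frames.values())
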
class ExtractManager(object):
    def __init__(self, pkg, cache=None):
        self.pkg = pkg
        self.pkg_root = Path(self.pkg.path).parent

        self._df = None
        self._agg_map = None

        if cache is None:
            self._cache = FileCache(self.pkg_root.joinpath('data', 'cache'))
        else:
            self._cache = cache

    @property
    def table_code_map(self):
        "Map from census table codes to friendlier names"
        return {
            c.props.get('tablecode'): c.name
            for c in self.pkg.resource('census_set').schema_term.find(
                'Table.Column') if c.props.get('tablecode')
        }

    @property
    def agg_map(self):

        if self._agg_map is None:
            _ = self.census_set  # Also creates the agg_map

        return self._agg_map

    def update_schema(self):
        pkg = mp.open_package(
            self.pkg.ref
        )  # Re-open in case it has changed since loaded in this notebook

        for c in pkg.resource('combined').schema_term.find('Table.Column'):
            if not c.description:
                c.description = self.column_map.get(c.name.upper())

        pkg.write()

    @property
    def column_map(self):
        # Gets created in base_census_df
        return self._cache.get('base_census_df_cm')

    @property
    def base_census_df(self):

        k = 'base_census_df'
        kcm = 'base_census_df_cm'

        if not self._cache.exists(k) or not self._cache.exists(kcm):
            logger.info('Collect frames')
            frames = [
                r.dataframe().drop(columns=['stusab', 'county', 'name'])
                for r in tqdm(self.pkg.references()) if r.name.startswith('B')
            ]

            # Need to do this here b/c we need the CensusDataFrame objects
            kv = list(
                filter(col_f,
                       chain.from_iterable(f.title_map.items()
                                           for f in frames)))
            column_map = {k: munge(v) for k, v in kv}

            logger.info('Assemble frames into dataset')
            df = reduce(lambda left, right: left.join(right), frames[1:],
                        frames[0])
            self._cache.put_df(k, df)
            self._cache.put(kcm, column_map)
            return df
        else:
            return self._cache.get_df(k)  # pairs with put_df above

    @property
    def census_set(self):

        if self._df is None:

            df = self.base_census_df

            # get rid of the margin columns
            m90_col = [c for c in df.columns if c.endswith('m90')]
            df = df.drop(columns=m90_col)

            logger.info('Make aggregate map')
            rows = []
            for acol, scols in aggregates.items():
                df[acol] = df.loc[:, scols].sum(axis=1)

                for c in scols:
                    rows.append((acol, c, self.column_map[c.upper()]))

            self._agg_map = pd.DataFrame(
                rows, columns=['agg_column', 'source_col', 'description'])

            df = df.reset_index()

            iq = self.pkg.reference('income_quartiles').dataframe()
            df = df.merge(iq.set_index('geoid'), on='geoid').fillna(0)

            agg = self.pkg.reference('aggregate_income').dataframe().drop(
                columns=['households'])
            df = df.merge(agg.set_index('geoid'), on='geoid').fillna(0)

            # Rename non-aggregated columns to nicer names
            df = df.rename(columns=self.table_code_map)

            cols = get_columns(
                self.pkg)  # Select only the columns described in the schema
            self._df = df.replace({'': 0}).fillna(0)[cols]

        return self._df

    outputs = ('census_set', 'agg_map')

    def build(self, force=False, clean=False):

        dd = self.pkg_root.joinpath('data')

        if clean:
            self._cache.clean()

        if not dd.exists():
            dd.mkdir(parents=True, exist_ok=True)

        for o in self.outputs:
            p = dd.joinpath(o).with_suffix('.csv')
            if not p.exists() or force:
                logger.info(f"Creating {o}{' (forcing)' if force else ''}")
                d = getattr(self, o)
                logger.info(f"Write {o}")
                d.to_csv(p, index=False)
            else:
                logger.info(f"{o} already exists")
def get_cache(pkg):
    return FileCache(Path(pkg.path).parent.joinpath('data', 'cache'))
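

# get_cache centralizes the FileCache construction repeated above, so the
# helpers can share one cache, e.g.:
#
#     cache = get_cache(pkg)
#     em = ExtractManager(pkg, cache=cache)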