def build_block_maps(pkg):
    cache = get_cache(pkg)
    states = list(stusab.values())

    grid_key = 'blocks/map/source/utm'
    cache.put(grid_key, pkg.reference('utm_grid').geoframe())

    cbsa_key = 'blocks/map/source/cbsa'
    cache.put(cbsa_key, pkg.reference('cbsa').geoframe().to_crs(4326))

    tasks = [(cache, st, grid_key, cbsa_key) for st in states]

    try:
        import appnope
        with appnope.nope_scope():
            r = run_mp(_f_block_maps, tasks)
    except ImportError:
        r = run_mp(_f_block_maps, tasks)

    cbsa_map = pd.concat([cache.get(e[0]) for e in r])

    utm_map = pd.concat([cache.get(e[1]) for e in r])

    pkg_root = Path(pkg.path).parent
    cbsa_map.to_csv(pkg_root.joinpath('data', 'cbsa_map.csv'), index=False)
    utm_map.to_csv(pkg_root.joinpath('data', 'utm_map.csv'), index=False)
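
# The try/except around appnope above disables macOS App Nap for the
# multiprocessing run only when that package is installed. A minimal sketch of
# the same guard factored into a reusable context manager; `nap_guard` is a
# hypothetical helper name, not part of this package.
from contextlib import contextmanager

@contextmanager
def nap_guard():
    """Disable App Nap inside the block when appnope is available; no-op otherwise."""
    try:
        import appnope
    except ImportError:
        yield
        return
    with appnope.nope_scope():
        yield

# With such a helper, the dispatch in build_block_maps would reduce to:
#     with nap_guard():
#         r = run_mp(_f_block_maps, tasks)
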
def build_clusters(pkg):

    cache = get_cache(pkg)

    cluster_logger.info('Caching source points by CBSA')
    cache_points_cbsa(pkg, cache)
    cache_lines_cbsa(pkg, cache)

    cluster_logger.info('Start MP run')
    tasks = [(cache, e.stem)
             for e in cache.list('clusters/source_points/cbsa')]
    r = run_mp(run_cbsa_clusters, tasks)

    cluster_logger.info('Assemble metro points')
    metro_point_keys = [k1 for k1, k2 in r if not isinstance(k1, Exception)]
    frames = [cache.get(k) for k in tqdm(metro_point_keys)]
    metro_points = pd.concat(frames)

    cluster_logger.info('Assemble clusters')
    cluster_keys = [k2 for k1, k2 in r if not isinstance(k1, Exception)]
    frames = [cache.get(k) for k in tqdm(cluster_keys) if cache.exists(k)]
    clusters = pd.concat(frames)

    cluster_logger.info('Write files')
    pkg_root = Path(pkg.path).parent
    metro_points.to_csv(pkg_root.joinpath('data', 'metro_points.csv'),
                        index=False)
    clusters.to_csv(pkg_root.joinpath('data', 'business_clusters.csv'),
                    index=False)
def split_blocks(pkg):
    """Download block files and cache them"""
    cache = get_cache(pkg)

    states = list(stusab.values())

    tasks = [(st, cache, pkg.reference('block_templ').url.format(st=st))
             for st in states]

    keys = run_mp(_f_get_split_blocks, tasks, n_cpu=4)  # 4 CPUs: download-bound

    return keys
    def get_tags_df(self, nrows=None, force=False):
        """Extract all of the OSM points records that have useful other_tags,
        add geohashes, and write the file to the cache"""

        tqdm.pandas()  # Add progress_apply()

        key = self.osm_key + '/tags'

        if self.cache.exists(key) and not force:
            return self.cache.get_df(key)

        p = self.cache.joinpath(self.csv_key, 'points.csv')

        logger.debug('Loading points file')
        df = pd.read_csv(p, low_memory=False, nrows=nrows)

        logger.debug('Generate tasks')
        tasks = [(e, self.extract_tags) for e in np.array_split(df, 200)]

        results = run_mp(OsmProcessor.do_extract_tags, tasks,
                         'Split OSM other_tags')
        self.tags = list(chain(*[e[0] for e in results]))
        self.errors = list(chain(*[e[1] for e in results]))

        logger.debug('Create tags df')
        tags_df = pd.DataFrame(self.tags,
                               columns=['osm_id'] + self.extract_tags)

        # Half of the entries, about 2.7M, are trees and rocks
        logger.debug('Remove trees and rocks')
        tags_df = tags_df[~tags_df.natural.isin(['tree', 'rock'])]

        logger.debug('Merge geometry')
        tags_df = pd.merge(tags_df, df[['osm_id', 'geometry']], on='osm_id')

        def encode(v):
            return gh.encode(*list(map(float, v[7:-1].split()))[::-1])

        logger.debug('Add geohash')
        tags_df['geohash'] = tags_df.geometry.progress_apply(encode)

        logger.debug('Convert to geopandas')
        tags_df['geometry'] = tags_df.geometry.progress_apply(
            shapely.wkt.loads)

        tags_df = gpd.GeoDataFrame(tags_df, geometry='geometry', crs=4326)

        logger.debug('Write to file')
        self.cache.put_df(key, tags_df)

        return tags_df
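
# The inline `encode` helper above slices a WKT point such as
# 'POINT (-117.1611 32.7157)' down to its coordinates and reverses them,
# because geohash encoders take (lat, lon) while WKT stores (lon, lat).
# A standalone sketch, assuming `gh` is a geohash library with an
# encode(lat, lon) signature (e.g. python-geohash or pygeohash):
import geohash as gh

def encode_wkt_point(wkt):
    lon, lat = map(float, wkt[7:-1].split())  # strip 'POINT (' and the trailing ')'
    return gh.encode(lat, lon)                # geohash string for the point
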
def join_blocks(pkg, break_starts):
    """Join census blocks and OSM points"""

    cache = get_cache(pkg)

    states = list(stusab.values())

    tasks = list(e + (cache,) for e in product(break_starts, states))

    keys = run_mp(_f_join_blocks, tasks)

    joins = [e for e in keys if not isinstance(e, Exception)]
    exn = [e for e in keys if isinstance(e, Exception)]

    return joins
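
# run_mp reports failed tasks as Exception instances in its result list, so
# join_blocks (and build_clusters above) filter them out before using the
# returned keys. A minimal sketch of that pattern as a standalone helper;
# `partition_results` is a hypothetical name, not part of this package.
def partition_results(results):
    """Split run_mp output into (successful, failed) lists."""
    ok = [r for r in results if not isinstance(r, Exception)]
    failed = [r for r in results if isinstance(r, Exception)]
    return ok, failed
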
def simplify_lines(pkg, recombine_keys):
    cache = get_cache(pkg)

    try:
        # Return the cached keys if this step has already been done
        return cache.get('simplified/simplified_keys')
    except KeyError:
        pass

    simplified_keys = run_mp(f_simplify_lines,
                             [(cache.root, e) for e in recombine_keys],
                             desc='Simplify')

    simplified_keys = list(chain(*simplified_keys))

    cache.put('simplified/simplified_keys', simplified_keys)

    return simplified_keys
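
# The try/KeyError guard above is the cache-memoization idiom used throughout
# this module: return the stored value when it exists, otherwise compute it,
# cache it, and return it. A hedged sketch of the idiom as a helper
# (hypothetical name), assuming cache.get raises KeyError for a missing key:
def get_or_compute(cache, key, compute):
    try:
        return cache.get(key)
    except KeyError:
        value = compute()
        cache.put(key, value)
        return value
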
def make_tags_df(pkg):
    """Create the tags dataframe"""
    cache = open_cache(pkg)

    points_logger.info('Make tags dataframe')

    try:
        tags_df = cache.get_df('points/tags_df')
    except KeyError:
        points_df = pkg.reference('points').read_csv(low_memory=False)

        # Split the file and extract tags in multiprocessing
        N_task = 200
        tasks = [(e, extract_tag_names)
                 for e in np.array_split(points_df, N_task)]

        results = run_mp(extract_tags, tasks, 'Split OSM other_tags')
        tags = list(chain(*[e[0] for e in results]))
        errors = list(chain(*[e[1] for e in results]))

        tags_df = pd.DataFrame(tags, columns=['osm_id'] + extract_tag_names)

        # Half of the entries, about 2.7M, are trees and rocks
        tags_df = tags_df[~tags_df.natural.isin(['tree', 'rock'])]

        tags_df = pd.merge(tags_df,
                           points_df[['osm_id', 'geometry']],
                           on='osm_id')

        def encode(v):
            return gh.encode(*list(map(float, v[7:-1].split()))[::-1])

        tags_df['geohash'] = tags_df.geometry.progress_apply(encode)

        tags_df['geometry'] = tags_df.geometry.progress_apply(
            shapely.wkt.loads)

        tags_df = gpd.GeoDataFrame(tags_df, geometry='geometry', crs=4326)

        cache.put_df('points/tags_df', tags_df)

    return tags_df
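
# Note: the progress_apply calls in make_tags_df assume tqdm's pandas
# integration was registered at import time (get_tags_df above does this
# explicitly). A minimal sketch of that setup:
from tqdm import tqdm

tqdm.pandas()  # registers Series.progress_apply / DataFrame.progress_apply
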
def run_overlay(pkg, splits, force=False):
    cache = get_cache(pkg)

    if not force:
        try:
            # Return the cached keys if this step has already been done
            recombine_keys = cache.get('recombine/recombine_keys')

            if len(recombine_keys) == len(splits):
                return recombine_keys

        except KeyError:
            pass

    tasks = [[cache.root, e, ro_key(e)] for e in splits]

    recombine_keys = run_mp(f_run_overlay, tasks, desc='Overlay Geohash')

    cache.put('recombine/recombine_keys', recombine_keys)

    return list(filter(bool, recombine_keys))
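
# Hedged usage sketch: run_overlay produces the recombine keys that
# simplify_lines consumes, so the two stages chain directly. `splits` is
# assumed here to come from an earlier split step such as split_blocks(pkg).
def overlay_and_simplify(pkg, splits):
    recombine_keys = run_overlay(pkg, splits)
    return simplify_lines(pkg, recombine_keys)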