def clip(db, in_table, clip_table, out_table):
    """Clip geometry of in_table by clip_table, writing output to out_table

    All non-geometry columns of in_table are carried through. Features that
    are completely covered by a clip feature keep their geometry unchanged,
    all others are replaced by the polygonal part of their intersection with
    the clip feature. Any existing out_table is dropped first.
    """
    # qualify each attribute column with the alias given to in_table below
    attribute_columns = []
    for column_name in db[in_table].columns:
        if column_name != 'geom':
            attribute_columns.append("a." + column_name)
    db[out_table].drop()
    template = """CREATE UNLOGGED TABLE {temp} AS
        SELECT
          {columns},
          CASE
            WHEN ST_CoveredBy(a.geom, b.geom) THEN a.geom
            ELSE ST_Multi(
                   ST_CollectionExtract(
                     ST_Intersection(a.geom,b.geom), 3)) END AS geom
        FROM {in_table} AS a
        INNER JOIN {clip_table} AS b
        ON ST_Intersects(a.geom, b.geom)
    """
    sql = template.format(
        temp=out_table,
        columns=", ".join(attribute_columns),
        in_table=in_table,
        clip_table=clip_table,
    )
    message = 'Clipping %s by %s to create %s' % (
        in_table, clip_table, out_table
    )
    util.log(message)
    db.execute(sql)
def overlay(in_file, in_layer, dump_file, new_layer_name):
    """Intersect layer with designatedlands

    The layer is loaded to postgres as table `new_layer_name`, intersected
    with the table named by config['out_table'], and the result is written to
    a table named `<new_layer_name, first 50 chars>_overlay`.

    in_file        - path to the file holding the layer to overlay
    in_layer       - layer within in_file; the first layer is used when falsy
    dump_file      - when truthy, also dump the overlay result to file
    new_layer_name - name of the loaded table; derived from in_layer when falsy
    """
    db = pgdata.connect(config['db_url'], schema="public")
    # fall back to the first layer of the file, and to a table name derived
    # from the layer name (maximum table name length is 63)
    in_layer = in_layer or fiona.listlayers(in_file)[0]
    new_layer_name = new_layer_name or in_layer[:63]
    # leave room for the suffix
    out_layer = new_layer_name[:50] + "_overlay"
    # load in_file to postgres
    db.ogr2pg(in_file, in_layer=in_layer, out_layer=new_layer_name)
    # pull distinct tiles iterable into a list
    # (to debug a specific tile, filter this list by tile prefix, eg '092K')
    tiles = list(db["tiles"].distinct('map_tile'))
    util.log("Intersecting %s with %s" % (config['out_table'], new_layer_name))
    geoutil.intersect(
        db,
        config['out_table'],
        new_layer_name,
        out_layer,
        config['n_processes'],
        tiles,
    )
    if not dump_file:
        return
    # dump result to file
    util.log("Dumping intersect to file %s " % config['out_file'])
    # NOTE(review): this passes three positional arguments - confirm that the
    # dump() in scope here accepts (layer, out_file, out_format)
    dump(out_layer, config['out_file'], config['out_format'])
def clean_and_agg_sources(db, source_csv, alias=None, force=False):
    """
    After sources are tiled and preprocessed, aggregation and cleaning is
    helpful to reduce topology exceptions in further processing.

    This is separate from the tiling / preprocessing because non-aggregated
    outputs (with the source designation name and id) are required.

    db         - database connection object
    source_csv - path to the csv listing the sources
    alias      - when given, process only the source with this alias
    force      - when True, rebuild cleaned tables that already exist
    """
    sources = util.read_csv(source_csv)
    # process only the source layer specified
    if alias:
        sources = [s for s in sources if s['alias'] == alias]
    # only designated lands sources are cleaned: skip anything flagged as
    # excluded or having hierarchy 0
    designated = [
        s for s in sources if s["exclude"] != 'T' and s['hierarchy'] != 0
    ]
    for source in designated:
        cleaned_table = source["cleaned_table"]
        # leave existing cleaned tables alone unless a rebuild is forced
        if cleaned_table in db.tables and not force:
            continue
        util.log("Cleaning and aggregating: %s" % source["alias"])
        db[cleaned_table].drop()
        lookup = {
            "out_table": cleaned_table,
            "src_table": source["tiled_table"],
        }
        query = db.build_query(db.queries["prep2_clean_agg"], lookup)
        db.execute(query)
def tile_sources(db, source_csv, alias=None, force=False):
    """
    Build the tiled table for each designated lands source by running the
    prep1_merge_tile_a query, which (per the original notes) will:

    - merge/union data within sources
    - cut sources by tile
    - repair source geom
    - add required columns:
        - designation (equivalent to source's alias in sources.csv)
        - designation_id (unique id of source feature)
        - designation_name (name of source feature)

    db         - database connection object
    source_csv - path to the csv listing the sources
    alias      - when given, process only the source with this alias
    force      - when True, rebuild tiled tables that already exist
    """
    sources = util.read_csv(source_csv)
    # process only the source layer specified
    if alias:
        sources = [s for s in sources if s['alias'] == alias]
    # only designated lands sources are tiled: skip anything flagged as
    # excluded or having hierarchy 0
    selected = [
        s for s in sources if s["exclude"] != 'T' and s['hierarchy'] != 0
    ]
    for source in selected:
        tiled_table = source["tiled_table"]
        # leave existing tiled tables alone unless a rebuild is forced
        if tiled_table in db.tables and not force:
            continue
        util.log("Tiling and validating: %s" % source["alias"])
        db[tiled_table].drop()
        lookup = {
            "out_table": tiled_table,
            "src_table": source["input_table"],
            "designation_id_col": source["designation_id_col"],
            "designation_name_col": source["designation_name_col"],
        }
        query = db.build_query(db.queries["prep1_merge_tile_a"], lookup)
        db.execute(query)
def create_bc_boundary(db, n_processes):
    """
    Create a comprehensive land-marine layer by combining three sources.

    Note that specifically named source layers are hard coded and must exist:
    - a00_bc_boundary_land (BC boundary layer from GeoBC, does not include
      marine)
    - a00_bc_abms (BC Boundary, ABMS)
    - a00_marine_ecosections (BC Marine Ecosections)

    db          - database connection object
    n_processes - number of worker processes used to populate bc_boundary
    """
    # create land/marine definition table
    db.execute(db.queries['create_bc_boundary'])
    # Prep boundary sources
    # First, combine ABMS boundary and marine ecosections.
    # Drop the table that is about to be created (previously a differently
    # named table was dropped, so a re-run found the table already present)
    db["a00_bc_boundary_marine"].drop()
    db.execute(
        """CREATE TABLE a00_bc_boundary_marine AS
           SELECT
             'bc_boundary_marine' as designation,
             ST_Union(geom) as geom
           FROM (SELECT st_union(geom) as geom
                 FROM a00_bc_abms
                 UNION ALL
                 SELECT st_union(geom) as geom
                 FROM a00_marine_ecosections) as foo
           GROUP BY designation"""
    )
    for source in ["a00_bc_boundary_land", "a00_bc_boundary_marine"]:
        util.log('Prepping and inserting into bc_boundary: %s' % source)
        # subdivide before attempting to tile
        db["temp_" + source].drop()
        db.execute(
            """CREATE UNLOGGED TABLE temp_{t} AS
               SELECT ST_Subdivide(geom) as geom
               FROM {t}""".format(t=source)
        )
        db["temp_" + source].create_index_geom()
        # tile
        db[source + "_tiled"].drop()
        lookup = {
            "src_table": "temp_" + source,
            "out_table": source + "_tiled",
        }
        db.execute(db.build_query(db.queries["prep1_merge_tile_b"], lookup))
        # the subdivided copy is only needed as input to the tiling query
        db["temp_" + source].drop()
        # combine the boundary layers into new table bc_boundary
        sql = db.build_query(
            db.queries["populate_output"],
            {"in_table": source + "_tiled", "out_table": "bc_boundary"},
        )
        tiles = get_tiles(db, source + "_tiled", "tiles")
        func = partial(parallel_tiled, db.url, sql)
        pool = multiprocessing.Pool(processes=n_processes)
        pool.map(func, tiles)
        pool.close()
        pool.join()
    # rename the 'designation' column
    db.execute(
        """ALTER TABLE bc_boundary
           RENAME COLUMN designation TO bc_boundary"""
    )
def _write_url_to_file(url, scheme, fp):
    """Stream the content at url (http, https or ftp) into open file fp
    """
    if scheme in ("http", "https"):
        # NOTE(review): verify=False disables TLS certificate verification;
        # retained because some sources may depend on it - confirm
        res = requests.get(url, stream=True, verify=False)
        if not res.ok:
            raise IOError(
                'Failed to download %s (HTTP status %s)'
                % (url, res.status_code)
            )
        for chunk in res.iter_content(CHUNK_SIZE):
            fp.write(chunk)
    elif scheme == "ftp":
        download = urllib.request.urlopen(url)
        try:
            block_sz = 8192
            while True:
                buffer = download.read(block_sz)
                if not buffer:
                    break
                fp.write(buffer)
        finally:
            download.close()
    else:
        # previously an unsupported scheme left an empty file behind, which
        # only failed later during extraction
        raise ValueError('Unsupported url scheme for download: %s' % url)


def download_non_bcgw(url, path, filename, layer=None, force_download=False):
    """
    Download and extract a zipfile to unique location
    Modified from https://github.com/OpenBounds/Processing/blob/master/utils.py

    url            - http, https or ftp url of the compressed file
    path           - folder under which the per-url download folder is created
    filename       - name of the data file expected inside the archive
    layer          - layer name; when falsy, the first layer of the extracted
                     file is used
    force_download - when True, remove any existing download and fetch again

    Returns a tuple of (path to the extracted file, layer name).
    Raises IOError on an unsuccessful http(s) response and ValueError when the
    url scheme is not supported.
    """
    # create a unique name for downloading and unzipping, this ensures a given
    # url will only get downloaded once
    out_folder = os.path.join(
        path, hashlib.sha224(url.encode('utf-8')).hexdigest()
    )
    out_file = os.path.join(out_folder, filename)
    if force_download and os.path.exists(out_folder):
        shutil.rmtree(out_folder)
    if not os.path.exists(out_folder):
        util.log('Downloading ' + url)
        parsed_url = urlparse(url)
        urlfile = parsed_url.path.split('/')[-1]
        # keep only the extension of the remote file for the temp file name
        _, extension = os.path.splitext(urlfile)
        fp = tempfile.NamedTemporaryFile('wb', suffix=extension, delete=False)
        try:
            try:
                _write_url_to_file(url, parsed_url.scheme, fp)
            finally:
                fp.close()
            # extract zipfile
            unzip_dir = util.make_sure_path_exists(out_folder)
            util.log('Extracting %s to %s' % (fp.name, unzip_dir))
            zipped_file = get_compressed_file_wrapper(fp.name)
            try:
                zipped_file.extractall(unzip_dir)
            finally:
                zipped_file.close()
        finally:
            # the temp file is created with delete=False, remove it ourselves
            if os.path.exists(fp.name):
                os.remove(fp.name)
    # get layer name
    if not layer:
        layer = fiona.listlayers(os.path.join(out_folder, filename))[0]
    return (out_file, layer)
def union(db, in_table, columns, out_table):
    """Union/merge overlapping records with equivalent values for provided
    columns

    `columns` is inserted as-is into both the SELECT list and the GROUP BY
    clause. The unioned geometry of each group is dumped to one output record
    per part. Any existing out_table is dropped first.
    """
    db[out_table].drop()
    params = {
        "temp": out_table,
        "columns": columns,
        "in_table": in_table,
    }
    template = """CREATE UNLOGGED TABLE {temp} AS
        SELECT
          {columns},
          (ST_Dump(ST_Union(geom))).geom as geom
        FROM {in_table}
        GROUP BY {columns}
    """
    sql = template.format(**params)
    message = 'Unioning geometries in %s by %s to create %s' % (
        in_table, columns, out_table
    )
    util.log(message)
    db.execute(sql)
def preprocess(db, source_csv, alias=None, force=False):
    """
    Preprocess sources as specified in source_csv

    Supported operations:
    - clip
    - union

    db         - database connection object
    source_csv - path to the csv listing the sources
    alias      - when given, process only the source with this alias
    force      - when True, redo preprocessing even if the <input_table>_preprc
                 table already exists

    Raises ValueError if a source requests an unsupported operation.
    """
    # explicit dispatch table rather than looking functions up in globals()
    operations = {'clip': clip, 'union': union}
    sources = util.read_csv(source_csv)
    # process only the source layer specified
    if alias:
        sources = [s for s in sources if s['alias'] == alias]
    preprocess_sources = [
        s for s in sources if s["preprocess_operation"] != ''
    ]
    for source in preprocess_sources:
        input_table = source["input_table"]
        preprc_table = input_table + "_preprc"
        # the _preprc table doubles as the flag that the job is already done
        if preprc_table in db.tables and not force:
            continue
        util.log("Preprocessing: %s" % source["alias"])
        operation = source['preprocess_operation']
        if operation not in operations:
            raise ValueError(
                'Preprocess operation %s not supported' % operation
            )
        operation_args = source['preprocess_args']
        # prefix clip layer name with 'a00', only non tiled, non hierarchy
        # clip layers are supported
        if operation == 'clip':
            operation_args = "a00_" + operation_args
        # call the specified preprocess function
        operations[operation](db, input_table, operation_args, preprc_table)
        # overwrite the input table with the preprocessed table, but
        # retain the _preprc table as a flag that the job is done
        db[input_table].drop()
        db.execute(
            """CREATE TABLE {t} AS
               SELECT * FROM {temp}
            """.format(t=input_table, temp=preprc_table)
        )
        # re-create spatial index
        db[input_table].create_index_geom()
def dump(overlaps, aggregate):
    """Dump output designatedlands table to file

    overlaps  - when truthy, dump the `<out_table>_overlaps` table instead of
                the table named by config['out_table']
    aggregate - when truthy, hand off to geoutil.dump_aggregate (the overlaps
                flag is then ignored)
    """
    if aggregate:
        if overlaps:
            util.log('ignoring --overlaps flag')
        geoutil.dump_aggregate(config, 'designatedlands_agg')
        return
    # work on a local name: appending to config['out_table'] in place would
    # compound the suffix if the shared config is used again
    out_table = config['out_table']
    if overlaps:
        out_table = out_table + '_overlaps'
    db = pgdata.connect(config["db_url"], schema="public")
    util.log('Dumping %s to %s' % (out_table, config['out_file']))
    # dump every attribute column except geometry and preliminary columns
    columns = [
        c for c in db[out_table].columns
        if c != 'geom' and 'prelim' not in c
    ]
    ogr_sql = """SELECT {cols},
          st_collectionextract(st_safe_repair(st_snaptogrid(geom, .001)), 3) as geom
        FROM {t}
        WHERE designation IS NOT NULL
    """.format(cols=",".join(columns), t=out_table)
    util.log(ogr_sql)
    # NOTE(review): a second connection (without the schema argument) is
    # opened for the export, as before - confirm whether the first connection
    # could be reused
    db = pgdata.connect(config["db_url"])
    db.pg2ogr(
        ogr_sql,
        config['out_format'],
        config['out_file'],
        out_table,
        geom_type="MULTIPOLYGON",
    )
def download_bcgw(url, dl_path, email, force_download=False):
    """Download BCGW data using bcdata/DWDS

    url            - DataBC catalogue url, its last path element is taken as
                     the package name
    dl_path        - folder in which the downloaded .gdb folder is stored
    email          - email address passed to bcdata.download
    force_download - when True, remove any existing download and fetch again

    Returns a tuple of (path to the .gdb folder, layer name).
    Raises Exception if bcdata.download returns a falsy result.
    """
    # derive databc package name from the url
    url_path = urlparse(url).path
    package = os.path.basename(url_path)
    # get schema/table from DataBC API
    object_name = bcdata.package_show(package)['object_name']
    name_parts = object_name.split('.')
    schema, table = name_parts[0], name_parts[1]
    # dwds download naming is consistent
    out_folder = os.path.join(dl_path, table + '.gdb')
    layer = schema + '_' + table
    if force_download and os.path.exists(out_folder):
        shutil.rmtree(out_folder)
    if os.path.exists(out_folder):
        # already downloaded
        return (out_folder, layer)
    util.log('Downloading %s' % package)
    download = bcdata.download(package, email)
    if not download:
        raise Exception("Failed to download " + package)
    shutil.copytree(download, out_folder)
    return (out_folder, layer)
def create_db():
    """Create a fresh database

    Creates the database named by config['db_url'] and adds the postgis
    extension. On posix systems the lostgis extension is added as well; on
    other systems a reminder to add the lostgis functions manually is logged.
    """
    util.log('Creating database %s' % config['db_url'])
    pgdata.create_db(config["db_url"])
    db = pgdata.connect(config["db_url"])
    db.execute("CREATE EXTENSION IF NOT EXISTS postgis")
    # the pgxn extension does not work on windows
    # note to the user to add lostgis functions manually with provided
    # .bat file as a reference
    if os.name == 'posix':
        db.execute("CREATE EXTENSION IF NOT EXISTS lostgis")
    else:
        util.log(
            'Remember to add required lostgis functions to your new database',
            level=30,
        )
        # backslash is escaped explicitly; the logged text is unchanged
        util.log('See scripts\\lostgis_windows.bat as a guide', level=30)
def process(resume, force_preprocess, tiles):
    """Create output designatedlands tables

    resume           - hierarchy number at which to resume processing; when
                       falsy the preliminary output tables are (re)created
                       and all sources are processed
    force_preprocess - passed as `force` to the preprocess, tile and
                       clean/aggregate steps
    tiles            - tile specification, parsed by geoutil.parse_tiles

    NOTE(review): the `tiles` parameter is rebound inside the source loop
    below, so the final intersect receives the tile set computed for the last
    source processed (the appended "tiles" layer) - confirm this is intended.
    """
    db = pgdata.connect(config["db_url"], schema="public")
    # run required preprocessing, tile, attempt to clean inputs
    geoutil.preprocess(db, config['source_csv'], force=force_preprocess)
    geoutil.tile_sources(db, config['source_csv'], force=force_preprocess)
    geoutil.clean_and_agg_sources(db, config['source_csv'], force=force_preprocess)
    # parse the list of tiles
    tilelist = geoutil.parse_tiles(db, tiles)
    # create target tables if not resuming from a bailed process
    if not resume:
        # create output tables
        db.execute(
            db.build_query(
                db.queries["create_outputs_prelim"],
                {"table": config['out_table']},
            ))
    # filter sources - use only non-excluded sources with a hierarchy other
    # than 0
    sources = [
        s for s in util.read_csv(config['source_csv'])
        if s['hierarchy'] != 0 and s["exclude"] != 'T'
    ]
    # To create output table with overlaps, combine all source data
    # (tiles argument does not apply, we could build a tile query string but
    # it seems unnecessary)
    for source in sources:
        util.log("Inserting %s into preliminary output overlap table" % source["tiled_table"])
        sql = db.build_query(
            db.queries["populate_output_overlaps"],
            {
                "in_table": source["tiled_table"],
                "out_table": config['out_table'] + "_overlaps_prelim",
            },
        )
        db.execute(sql)
    # To create output table with no overlaps, more processing is required
    # In case of bailing during tests/development, `resume` option is available
    # to enable resumption of processing at specified hierarchy number
    if resume:
        p_sources = [s for s in sources if int(s["hierarchy"]) >= int(resume)]
    else:
        # note: this is the same list object as `sources`, so the append
        # below also adds the tiles entry to `sources`
        p_sources = sources
    # The tiles layer will fill in gaps between sources (so all BC is included
    # in output). To do this, first match schema of tiles to other sources
    db.execute("ALTER TABLE tiles ADD COLUMN IF NOT EXISTS id integer")
    db.execute("UPDATE tiles SET id = tile_id")
    db.execute("ALTER TABLE tiles ADD COLUMN IF NOT EXISTS designation text")
    # Next, add simple tiles layer definition to sources list
    p_sources.append({"cleaned_table": "tiles", "category": None})
    # iterate through all sources
    for source in p_sources:
        sql = db.build_query(
            db.queries["populate_output"],
            {
                "in_table": source["cleaned_table"],
                "out_table": config['out_table'] + "_prelim",
            },
        )
        # determine which specified tiles are present in source layer
        src_tiles = set(
            geoutil.get_tiles(db, source["cleaned_table"], tile_table='tiles'))
        # restrict to the requested tiles when a tile list was given
        # (this rebinds the `tiles` parameter)
        if tilelist:
            tiles = set(tilelist) & src_tiles
        else:
            tiles = src_tiles
        if tiles:
            util.log("Inserting %s into preliminary output table" % source["cleaned_table"])
            # for testing, run only one process and report on tile
            if config['n_processes'] == 1:
                for tile in tiles:
                    util.log(tile)
                    # the query takes the tile prefix pattern twice
                    db.execute(sql, (tile + "%", ) * 2)
            else:
                func = partial(geoutil.parallel_tiled, db.url, sql, n_subs=2)
                pool = multiprocessing.Pool(processes=config['n_processes'])
                pool.map(func, tiles)
                pool.close()
                pool.join()
        else:
            util.log("No tiles to process")
    # create marine-terrestrial layer
    if 'bc_boundary' not in db.tables:
        geoutil.create_bc_boundary(db, config['n_processes'])
    # overlay output tables with marine-terrestrial definition
    for table in [config['out_table'], config['out_table'] + "_overlaps"]:
        util.log('Cutting %s with marine-terrestrial definition' % table)
        # `tiles` here is whatever the last iteration of the loop above left
        geoutil.intersect(
            db,
            table + "_prelim",
            "bc_boundary",
            table,
            config['n_processes'],
            tiles,
        )
    tidy_designations(db, sources, "cleaned_table", config['out_table'])
    tidy_designations(db, sources, "tiled_table", config['out_table'] + "_overlaps")
def load(alias, force_download):
    """Download data, load to postgres

    alias          - when given, load only the source with this alias
    force_download - passed through to the download functions

    Raises ValueError if `alias` matches no source, and Exception if a
    manually downloaded source file is missing. Load commands that exit with
    a non-zero status are logged as warnings.
    """
    db = pgdata.connect(config["db_url"])
    sources = util.read_csv(config["source_csv"])
    # filter sources based on optional provided alias and ignoring excluded
    if alias:
        sources = [s for s in sources if s["alias"] == alias]
        if not sources:
            raise ValueError('Alias %s does not exist' % alias)
    sources = [s for s in sources if s["exclude"] != 'T']
    # process sources where automated downloads are available
    load_commands = []
    for source in [s for s in sources if s["manual_download"] != 'T']:
        # handle BCGW downloads
        if urlparse(source["url"]).hostname == 'catalogue.data.gov.bc.ca':
            path, layer = download.download_bcgw(
                source["url"],
                config["dl_path"],
                email=config["email"],
                force_download=force_download,
            )
        # handle all other downloads (zipfiles only)
        else:
            path, layer = download.download_non_bcgw(
                source['url'],
                config['dl_path'],
                source['file_in_url'],
                source['layer_in_file'],
                force_download=force_download,
            )
        load_commands.append(
            db.ogr2pg(
                path,
                in_layer=layer,
                out_layer=source["input_table"],
                sql=source["query"],
                cmd_only=True,
            ))
    # process manually downloaded sources
    for source in [s for s in sources if s["manual_download"] == 'T']:
        path = os.path.join(config['dl_path'], source["file_in_url"])
        if not os.path.exists(path):
            raise Exception(path + " does not exist, download it manually")
        load_commands.append(
            db.ogr2pg(
                path,
                in_layer=source['layer_in_file'],
                out_layer=source["input_table"],
                sql=source["query"],
                cmd_only=True,
            ))
    # run all ogr commands in parallel
    util.log('Loading source data to database.')
    # https://stackoverflow.com/questions/14533458/python-threading-multiple-bash-subprocesses
    processes = [
        (cmd, subprocess.Popen(cmd, shell=True)) for cmd in load_commands
    ]
    # wait for every command, then report the ones that failed rather than
    # silently ignoring their exit status
    failed_commands = [cmd for cmd, p in processes if p.wait() != 0]
    for cmd in failed_commands:
        util.log('Load command failed: %s' % cmd, level=30)
    # create tiles layer
    util.log('Creating tiles layer')
    db.execute(db.queries["create_tiles"])
def dump_aggregate(config, new_layer_name):
    r"""
    UNSUPPORTED
    test aggregation of designatedlands over tile boundaries

    Output data is aggregated across map tiles to remove gaps introduced in
    tiling of the sources. Aggregation is by distinct 'designation' in the
    output layer, and is run separately for each designation for speed.
    To dump data aggregated by 'category' or some other field, build and run
    your own ogr2ogr command based on below queries.

    This command is unsupported, aggregation does not quite remove gaps across
    all records and is very slow. Use mapshaper to aggregate outputs from the
    dump command (convert to shapefile first)

    eg:
    $ mapshaper designatedlands.shp \
        -clean snap-interval=0.01 \
        -dissolve designatio copy-fields=category \
        -explode \
        -o dl_clean.shp

    config         - configuration dict; db_url, out_table, out_format and
                     out_file are read
    new_layer_name - name of the aggregated table to create and dump
    """
    db = pgdata.connect(config["db_url"], schema="public")
    util.log('Aggregating %s to %s' % (config['out_table'], new_layer_name))
    # find all non-null designations
    designations = [
        d for d in db[config['out_table']].distinct('designation') if d
    ]
    db[new_layer_name].drop()
    # create an empty table with the required columns
    sql = """CREATE TABLE {new_layer_name} AS
             SELECT designation, category, bc_boundary, geom
             FROM {out_table}
             LIMIT 0""".format(
        new_layer_name=new_layer_name, out_table=config['out_table']
    )
    db.execute(sql)
    # iterate through designations to speed up the aggregation
    for designation in designations:
        util.log('Adding %s to %s' % (designation, new_layer_name))
        # dump records entirely within a tile
        sql = """
        INSERT INTO {new_layer_name} (designation, category, bc_boundary, geom)
        SELECT
          dl.designation,
          dl.category,
          dl.bc_boundary,
          dl.geom
        FROM {t} dl
        INNER JOIN tiles ON dl.map_tile = tiles.map_tile
        WHERE dl.designation = %s
        AND ST_Coveredby(dl.geom, ST_Buffer(tiles.geom, -.01))
        """.format(
            t=config['out_table'], new_layer_name=new_layer_name
        )
        db.execute(sql, (designation,))
        # aggregate cross-tile records
        # Notes:
        # - expansion/contraction buffering of 3mm to remove gaps between tiles
        # - ST_Buffer of 0 on ST_Collect is much faster than ST_Union
        # - ST_Collect is a bit less robust, it requires
        #   ST_RemoveRepeatedPoints to complete successfully on sources that
        #   appear to come from rasters (mineral_reserve, ogma_legal)
        # - the source table is taken from config (it was previously hard
        #   coded in this query only)
        sql = """
        INSERT INTO {new_layer_name} (designation, category, bc_boundary, geom)
        SELECT
          designation,
          category,
          bc_boundary,
          (ST_Dump(ST_Buffer(geom, -.003))).geom as geom
        FROM (
          SELECT
            dl.designation,
            dl.category,
            dl.bc_boundary,
            ST_Buffer(
              ST_RemoveRepeatedPoints(
                ST_SnapToGrid(
                  ST_Collect(
                    ST_Buffer(dl.geom, .003)), .001), .01), 0) as geom
          FROM {t} dl
          INNER JOIN tiles ON dl.map_tile = tiles.map_tile
          WHERE dl.designation = %s
          AND NOT ST_Coveredby(dl.geom, ST_Buffer(tiles.geom, -.01))
          GROUP BY dl.designation, dl.category, dl.bc_boundary) as foo
        """.format(
            t=config['out_table'], new_layer_name=new_layer_name
        )
        db.execute(sql, (designation,))
    util.log('Dumping %s to file %s' % (new_layer_name, config['out_file']))
    db.pg2ogr(
        "SELECT * from " + new_layer_name,
        config['out_format'],
        config['out_file'],
        new_layer_name,
    )
def intersect(
    db, in_table, intersect_table, out_table, n_processes, tiles=None
):
    """
    Intersect in_table with intersect_table, creating out_table

    Inputs may not have equivalently named columns (other than geom and
    map_tile); an Exception is raised if they do.

    db              - database connection object
    in_table        - name of the first input table
    intersect_table - name of the table to intersect with
    out_table       - name of the table to create, any existing one is dropped
    n_processes     - number of worker processes
    tiles           - tiles to process; when falsy they are derived from
                      intersect_table
    """
    # geom and map_tile are hard coded into the queries
    hard_coded = ('geom', 'map_tile')
    # examine the inputs to determine what columns should be in the output
    in_columns = [Column(c.name, c.type) for c in db[in_table].sqla_columns]
    intersect_columns = [
        Column(c.name, c.type)
        for c in db[intersect_table].sqla_columns
        if c.name not in hard_coded
    ]
    # output column names must be unique across the two inputs
    in_names = {c.name for c in in_columns if c.name not in hard_coded}
    intersect_names = {c.name for c in intersect_columns}
    non_unique_columns = in_names & intersect_names
    if non_unique_columns:
        util.log(
            'Column(s) found in both sources: %s'
            % ",".join(non_unique_columns)
        )
        raise Exception("Input column names must be unique")

    # create output table, with a primary key added
    db[out_table].drop()
    pk = Column(out_table + "_id", Integer, primary_key=True)
    pgdata.Table(
        db, "public", out_table, [pk] + in_columns + intersect_columns
    )

    # populate the output table - the query used depends on whether the
    # intersect table is itself tiled
    lookup = {
        "in_table": in_table,
        "in_columns": ", ".join(in_names),
        "intersect_table": intersect_table,
        "intersect_columns": ", ".join(intersect_names),
        "out_table": out_table,
    }
    intersect_table_names = [c.name for c in db[intersect_table].sqla_columns]
    if 'map_tile' in intersect_table_names:
        query = "intersect_alltiled"
    else:
        query = "intersect_inputtiled"
        lookup["tile_table"] = "tiles"
    sql = db.build_query(db.queries[query], lookup)

    if not tiles:
        tiles = get_tiles(db, intersect_table, "tiles")
    func = partial(parallel_tiled, db.url, sql)
    pool = multiprocessing.Pool(processes=n_processes)
    # consume results as they complete so that progress can be shown
    results_iter = pool.imap_unordered(func, tiles)
    with click.progressbar(results_iter, length=len(tiles)) as bar:
        for _ in bar:
            pass
    pool.close()
    pool.join()

    # delete any records with empty geometries in the out table
    db.execute(
        """DELETE FROM {t} WHERE ST_IsEmpty(geom) = True
        """.format(t=out_table)
    )
    # add map_tile index to output
    db.execute(
        """CREATE INDEX {t}_tileix
           ON {t} (map_tile text_pattern_ops)
        """.format(t=out_table)
    )