def transform(source: str, input_filename: str, schema_filename: str, output_filename: str):
    """
    Transform an ADM0 boundary dataset to the project schema and write it out.

    :param source: "cod" or "gadm"
    :param input_filename: path to the zipped source dataset
    :param schema_filename: YAML schema file used for validation
    :param output_filename: path the transformed layer is written to
    :raises ValueError: if ``source`` is not a recognised provider
    """
    config = parse_yaml('config.yml')
    if source == "cod":
        df_adm0 = gpd.read_file(f'zip://{input_filename}')
        schema_mapping = {'admin0Name_en': 'name_en'}
    elif source == "gadm":
        df_adm0 = gpd.read_file(
            f'zip://{input_filename}!{GADM_FILENAME.format(ISO3=config["constants"]["ISO3"])}',
            layer=GADM_LAYER.format(ISO3=config['constants']['ISO3']))
        schema_mapping = {'NAME_0': 'name_en'}
    else:
        # Previously an unrecognised source fell through and raised an opaque
        # NameError on df_adm0; fail fast with a clear message instead.
        raise ValueError(f'Unknown source "{source}": expected "cod" or "gadm"')
    # Change CRS
    df_adm0 = df_adm0.to_crs(config['constants']['crs'])
    # Modify the column names to suit the schema
    df_adm0 = df_adm0.rename(columns=schema_mapping)
    # Make columns needed for validation
    df_adm0['geometry_type'] = df_adm0['geometry'].apply(lambda x: x.geom_type)
    df_adm0['crs'] = df_adm0.crs
    # Validate
    validate(instance=df_adm0.to_dict('list'), schema=parse_yaml(schema_filename))
    # Write to output
    df_adm0.to_file(output_filename)
def extract_osm_query():
    """
    CLI entry point: query Overpass for features matching a tag schema,
    save the OSM XML response and convert it to a GeoPackage.

    Expected ``sys.argv``:
        1: Overpass API URL, 2: country code, 3: tag-schema YAML path,
        4: OSM XML output path, 5: GeoPackage output path.
    """
    osm_url = sys.argv[1]           # e.g. "http://overpass-api.de/api/interpreter?"
    country = sys.argv[2]           # e.g. 'YE'
    # Parse the schema YAML once; the original parsed the same file twice.
    osm_schema = parse_yaml(sys.argv[3])  # e.g. 'schemas/osm_tags_lakes.yml'
    geom_type = osm_schema['geom_type']
    osm_output_file = sys.argv[4]   # e.g. 'raw_data/osm_rivers_pol.xml'
    gpkg_output_file = sys.argv[5]  # e.g. 'raw_data/osm_rivers_pol.gpkg'
    get_osm_xml(osm_url, osm_query(osm_schema, country), osm_output_file)
    convert_osm2gpkg(osm_output_file, gpkg_output_file, geom_type)
def transform():
    """
    Build the processed "surrounding countries" ADM0 layer from the GADM
    world GeoPackage: unzip, read, find the neighbours of the AOI country,
    reproject, rename to schema, validate and write out.
    """
    config = parse_yaml('config.yml')
    held_gpkg = os.path.join(config['dirs']['raw_data'],
                             config['surrounding']['gadm']['raw'])
    country_aoi = config['constants']['ISO3']
    schema_filename = os.path.join(config['dirs']['schemas'],
                                   config['surrounding']['schema'])
    output_filename = os.path.join(config['dirs']['processed_data'],
                                   config['surrounding']['gadm']['processed'])
    # Unzip - as reading zipped world geopackage takes too long
    rawdir = config['dirs']['raw_data']
    zipgpkg = config['surrounding']['gadm']['rawzip']
    source_gadm_world = os.path.join(rawdir, zipgpkg)
    print(r'Unzipping {0} to {1}'.format(source_gadm_world, held_gpkg))
    # Context manager guarantees the archive is closed even if extraction
    # fails (the original left the ZipFile open on error).
    with zipfile.ZipFile(source_gadm_world, 'r') as gadmzip:
        gadmzip.extractall(rawdir)
    # Check unzip was ok?
    print(f'Reading {held_gpkg}')
    for layername in fiona.listlayers(held_gpkg):
        print(f'Reading {layername} into Geopandas. Takes about 2 mins...')
        print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        gdf = gpd.read_file(held_gpkg, layer=layername)
        print(r'Done reading.')
        print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    # Process the country of interest
    gdf_aoi_levels = gadm_gpkg_processing.get_country_admin_levels(
        gdf, country_aoi)
    # Get neighbours
    neighbours = gadm_gpkg_processing.get_neighbour_countries(
        gdf, gdf_A0=gdf_aoi_levels['a0'])
    # Remove country of interest from neighbours list if required.
    # if country_aoi in neighbours:
    #     neighbours.remove(country_aoi)
    print(r'Found {0}'.format(','.join(neighbours)))
    # Process all neighbour countries
    A0_list = []
    for neighbour in neighbours:
        gdf_aoi_levels = gadm_gpkg_processing.get_country_admin_levels(
            gdf, neighbour)
        A0_list.append(gdf_aoi_levels['a0'])
    # Concatenate neighbours A0 gemetries into single GeoDataFrame
    gdf_A0_all = gpd.GeoDataFrame(pd.concat(A0_list, ignore_index=True))
    # Redefine definition. 'EPSG:4326' replaces the proj4 init-style mapping
    # {'init': 'epsg:4326'}, which is deprecated/removed in pyproj 2+.
    gdf_A0_all.crs = 'EPSG:4326'
    # Reproject
    gdf_A0_all = gdf_A0_all.to_crs(config['constants']['crs'])
    # Apply schema
    gdf_A0_all = gdf_A0_all.rename(columns={'NAME_0': 'name_en'})
    # Make columns needed for validation
    gdf_A0_all['geometry_type'] = gdf_A0_all['geometry'].apply(
        lambda x: x.geom_type)
    gdf_A0_all['crs'] = gdf_A0_all.crs
    # Validate
    validate(instance=gdf_A0_all.to_dict('list'),
             schema=parse_yaml(schema_filename))
    # Write to output
    gdf_A0_all.to_file(output_filename)
def transform_geoboundaries():
    """
    Build the processed "surrounding countries" layer from downloaded
    GeoBoundaries ADM0 zips: unzip each, merge, pick the AOI country's
    neighbours, reproject, rename to schema, validate and write out.
    """
    config = parse_yaml('config.yml')
    rawdir = os.path.join(config['dirs']['raw_data'],
                          config['geoboundaries']['subfolder'])
    iso = config['constants']['ISO3']
    isofield = config['geoboundaries']['isofield']
    schema_filename = os.path.join(
        config['dirs']['schemas'], config['surrounding']['schema'])
    output_filename = os.path.join(
        config['dirs']['processed_data'],
        config['surrounding']['geoboundaries']['processed'])
    dfs = []
    for root, dirs, files in os.walk(rawdir):
        for fileName in files:
            if fileName.endswith(".zip"):
                # Unzip
                forUnzip, ext = os.path.splitext(fileName)
                source = os.path.join(root, fileName)
                print(f'Processing {source}')
                try:
                    geobndzip = zipfile.ZipFile(source, 'r')
                except Exception as err:
                    # best-effort: skip unreadable archives
                    print(err)
                    continue
                unzipped = os.path.join(root, forUnzip)
                # Close the archive after extraction (the original leaked
                # the open ZipFile handle).
                with geobndzip:
                    geobndzip.extractall(unzipped)
                # Capture GeoJson
                for fName in os.listdir(unzipped):
                    if fName.endswith(".geojson"):
                        df = gpd.read_file(os.path.join(unzipped, fName))
                        dfs.append(df)
                # Delete dir
                shutil.rmtree(unzipped)
    # Merge into single total GeoBoundaries ADM0 dataset
    gdf_source = gpd.GeoDataFrame(pd.concat(dfs, ignore_index=True))
    # get AOI from this dataset
    gdf_aoi = gdf_source.loc[gdf_source[isofield] == iso]
    # Check + concat + process neighbours
    neighbours = get_neighbours_generic(gdf_source, gdf_aoi, isofield)
    dfs_ngb = []
    for ngb in neighbours:
        dfs_ngb.append(gdf_source.loc[gdf_source[isofield] == ngb])
    gdf_ngb = gpd.GeoDataFrame(pd.concat(dfs_ngb, ignore_index=True))
    # Redefine definition. 'EPSG:4326' replaces the proj4 init-style mapping
    # {'init': 'epsg:4326'}, which is deprecated/removed in pyproj 2+.
    gdf_ngb.crs = 'EPSG:4326'
    # Reproject
    gdf_ngb = gdf_ngb.to_crs(config['constants']['crs'])
    # Apply schema
    gdf_ngb = gdf_ngb.rename(columns={'shapeName': 'name_en'})
    # Make columns needed for validation
    gdf_ngb['geometry_type'] = gdf_ngb['geometry'].apply(lambda x: x.geom_type)
    gdf_ngb['crs'] = gdf_ngb.crs
    # Validate
    validate(
        instance=gdf_ngb.to_dict('list'), schema=parse_yaml(schema_filename))
    # Write to output
    gdf_ngb.to_file(output_filename)
def transform(source: str, input_filename: str, schema_filename: str, output_filename: str):
    """
    Transform a roads dataset to the project schema and write it out.

    :param source: "cod" or "osm"
    :param input_filename: path to the source dataset (zipped for "cod")
    :param schema_filename: YAML schema file (validation currently disabled)
    :param output_filename: path the transformed layer is written to
    :raises ValueError: if ``source`` is not a recognised provider
    """
    print(source, input_filename, schema_filename, output_filename)
    config = parse_yaml('config.yml')
    if source == "osm":
        df_roads = gpd.read_file(input_filename)
        # df_roads = convert_osm_to_gpkg(input_filename, 'osm_roads.gpkg', 'lines')
        schema_mapping = {
            'name:en': 'name_en',
            'name': 'name_loc',
            'highway': 'fclass'
        }
        # GDAL converts OSM to GPKG, tags are written as hstore key-value in attribute 'other_tags'
        # method to convert hstore string to dictionary from SqlAlchemy
        hstore = HSTORE.result_processor(None, None, 'string')
        df_roads['other_tags'] = df_roads['other_tags'].apply(hstore)
        for key, value in schema_mapping.items():
            # temp dictionary for pandas rename method. Don't use original dict as want to see whether
            # each input attribute is present.
            temp_schema_dict = {key: value}
            try:
                # rename column if exists.
                df_roads = df_roads.rename(columns=temp_schema_dict, errors="raise")
            # rename(errors="raise") raises KeyError for a missing column;
            # catch only that instead of the original bare except.
            except KeyError:
                # as error raised, input attribute is not present.
                # now make sure output attribute is NOT present. If not pull
                # from 'other_tags'.
                if value not in df_roads.columns:
                    # bind key as a default to avoid late-binding surprises
                    df_roads[value] = df_roads['other_tags'].apply(
                        lambda x, key=key: x.get(key) if isinstance(x, dict) else x)
        # now remove columns which aren't in schema:
        schema_to_keep = list(schema_mapping.values())
        # add geometry to schema
        schema_to_keep.append('geometry')
        df_roads = df_roads.filter(schema_to_keep)
    elif source == "cod":
        df_roads = gpd.read_file(f'zip://{input_filename}')
        # COD data has some NAs
        df_roads = df_roads[df_roads['geometry'].notna()]
        schema_mapping = {'TYPE': 'fclass'}
        # Rename columns using schema_mapping
        df_roads = df_roads.rename(columns=schema_mapping)
    else:
        # Previously an unrecognised source fell through and raised an opaque
        # NameError on df_roads; fail fast with a clear message instead.
        raise ValueError(f'Unknown source "{source}": expected "cod" or "osm"')
    # TODO need to convert from XML to GPKG rather than OSM to GPKG
    # Change CRS
    df_roads = df_roads.to_crs(config['constants']['crs'])
    # Make columns needed for validation
    ### df_roads['geometry_type'] = df_roads['geometry'].apply(lambda x: x.geom_type)
    ### df_roads['crs'] = df_roads.crs
    # Validate
    ### validate(instance=df_roads.to_dict('list'), schema=parse_yaml(schema_filename))
    # Write to output
    df_roads.to_file(output_filename, encoding='utf8')
def get_all_adm0(previous=None):
    """
    Download every GeoBoundaries ADM0 zip listed by the API into the raw
    data folder, then log the API response as a dated CSV.

    :param previous: placeholder for a prior scan/download record; the
        incremental path is not implemented yet (TODO).
    """
    config = parse_yaml('config.yml')
    url = "{0}?{1}".format(config['geoboundaries']['url'],
                           config['geoboundaries']['all'])
    rawdir = os.path.join(config['dirs']['raw_data'],
                          config['geoboundaries']['subfolder'])
    Path(rawdir).mkdir(parents=True, exist_ok=True)
    geodb_api_response = get_json(url)
    df = pd.DataFrame(geodb_api_response)
    if previous:
        # Load the previous scan/download
        # Check the date - if previous is available
        pass
    else:
        # Use latest CSV with format 'gbd_YYYYMMDD.csv'
        # Process the DataFrame to remove all dates that are up to date.
        # If no latest CSV found download the whole lot
        for _, row in df.iterrows():
            # Download the zip (to a GeoBoundaries ADM0 folder)
            # Unzip is part of the transform process.
            # Use transform/adm0.py ?
            dl = row['downloadURL']
            # basename replaces the original os.path.split whose first
            # element was unused
            fName = os.path.basename(dl)
            outpath = os.path.join(rawdir, fName)
            print(r'Downloading {0} to {1}'.format(dl, outpath))
            download_url(dl, outpath)
    # Log the output details
    now = datetime.datetime.now().strftime("%Y%m%d")
    df.to_csv(os.path.join(rawdir, "gbd_{0}.csv".format(now)), index=False)
def get_geoboundaries_adm(admLevel=None):
    """
    Download GeoBoundaries admin-level zips for the configured country.

    :param admLevel: specific ADM level to fetch; ``None`` fetches all
        levels the API reports.
    """
    config = parse_yaml('config.yml')
    rooturl = config['geoboundaries']['url']
    iso = config['constants']['ISO3']
    typ = config['geoboundaries']['typ']
    # Get Levels
    url = "{0}?ISO={1}&{2}".format(rooturl, iso, typ)
    knownlevels = len(get_json(url))
    # Process Data
    if admLevel is not None:
        url = r"{0}&ADM={1}".format(url, admLevel)
    # We want all available admin boundaries so skip specifying which AMD to
    # process. Otherwise include a specific ADM level to process.
    geob_api_response = get_json(url)
    # Check whether there may be additional admin levels available
    if len(geob_api_response) > knownlevels and admLevel is None:
        print('Note. Additional Admin levels may have been added.')
        # Can check the JSON for all admin level on the fly.
        print('\tConfirm with {0}'.format(url))
        print('\tCheck output dir {0}'.format(config['dirs']['raw_data']))
    # Loop-invariant setup hoisted out of the download loop: the raw dir is
    # the same for every level and only needs creating once.
    rawdir = config['dirs']['raw_data']
    Path(rawdir).mkdir(parents=True, exist_ok=True)
    # Get the download urls from the api response
    for level in geob_api_response:
        adm = level[config['geoboundaries']['boundaryType_api_key']].lower()
        raw = r'{0}{1}.zip'.format(config['geoboundaries']['raw'], adm)
        dl = level[config['geoboundaries']['downloadURL_api_key']]
        outpath = os.path.join(rawdir, raw)
        print(r'Downloading {0} to {1}'.format(dl, outpath))
        download_url(dl, outpath)
def get_world():
    """Download the GADM world archive into the raw-data directory."""
    config = parse_yaml('config.yml')
    raw_dir = config['dirs']['raw_data']
    archive_name = config['surrounding']['gadm']['rawzip']
    source_url = config['surrounding']['gadm']['url']
    destination = os.path.join(raw_dir, archive_name)
    print('Downloading {0} to {1}'.format(source_url, destination))
    download_url(source_url, destination)
def transform(source: str, input_filename: str, schema_filename: str, output_filename: str):
    """
    Transform an ADM3 boundary dataset to the project schema and write it out.

    :param source: "cod" or "gadm"
    :param input_filename: path to the zipped source dataset
    :param schema_filename: YAML schema file used for validation
    :param output_filename: path the transformed layer is written to
    :raises ValueError: if ``source`` is unknown or (for "cod") no suitable
        adm3 layer is found
    """
    config = parse_yaml('config.yml')
    if source == "cod":
        layerlist = fiona.listlayers(f'zip://{input_filename}')
        search = 'adm3'
        adm3_name = None
        # Pick the 'adm3' layer containing MultiPolygon features (the last
        # matching layer wins, as in the original scan).
        for layername in layerlist:
            if search in layername:
                with fiona.open(f'zip://{input_filename}', layer=layername) as layer:
                    for feature in layer:
                        if feature['geometry']['type'] == 'MultiPolygon':
                            adm3_name = layername
        if adm3_name is None:
            # Previously this crashed later with an opaque NameError.
            raise ValueError(
                f'No adm3 MultiPolygon layer found in {input_filename}')
        # The original recovered the name via layerlist.index() round-trip,
        # which is an identity no-op; use the matched name directly.
        df_adm3 = gpd.read_file(f'zip://{input_filename}', layer=adm3_name)
        schema_mapping = {'admin3Name_en': 'name_en'}
    elif source == "gadm":
        df_adm3 = gpd.read_file(
            f'zip://{input_filename}!{GADM_FILENAME.format(ISO3=config["constants"]["ISO3"])}',
            layer=GADM_LAYER.format(ISO3=config['constants']['ISO3']))
        schema_mapping = {'NAME_3': 'name_en'}
    else:
        raise ValueError(f'Unknown source "{source}": expected "cod" or "gadm"')
    # Change CRS
    df_adm3 = df_adm3.to_crs(config['constants']['crs'])
    # Modify the column names to suit the schema
    df_adm3 = df_adm3.rename(columns=schema_mapping)
    # Make columns needed for validation
    df_adm3['geometry_type'] = df_adm3['geometry'].apply(lambda x: x.geom_type)
    df_adm3['crs'] = df_adm3.crs
    # Validate
    validate(instance=df_adm3.to_dict('list'), schema=parse_yaml(schema_filename))
    # Write to output
    df_adm3.to_file(output_filename)
def transform(source: str, input_filename: str, schema_filename: str, output_filename: str):
    """
    Transform a seaports dataset to the project schema and write it out.

    :param source: "cod" (the only provider currently supported)
    :param input_filename: path to the zipped source dataset
    :param schema_filename: YAML schema file used for validation
    :param output_filename: path the transformed layer is written to
    :raises ValueError: if ``source`` is not "cod"
    """
    config = parse_yaml('config.yml')
    if source == "cod":
        df_seaports = gpd.read_file(f'zip://{input_filename}')
    else:
        # Previously any other value crashed later with an opaque NameError
        # on df_seaports; fail fast with a clear message instead.
        raise ValueError(f'Unknown source "{source}": expected "cod"')
    # Drop rows with missing geometry, then change CRS
    df_seaports = df_seaports[df_seaports['geometry'].notna()]
    df_seaports = df_seaports.to_crs(config['constants']['crs'])
    # Make columns needed for validation
    df_seaports['geometry_type'] = df_seaports['geometry'].apply(
        lambda x: x.geom_type)
    df_seaports['crs'] = df_seaports.crs
    # Validate
    validate(instance=df_seaports.to_dict('list'),
             schema=parse_yaml(schema_filename))
    # Write to output
    df_seaports.to_file(output_filename)
def transform(source: str, input_filename: str, schema_filename: str, output_filename: str):
    """
    Transform an ADM1 boundary dataset to the project schema and write it out.

    :param source: "cod", "gadm" or "geoboundaries"
    :param input_filename: path to the zipped source dataset
    :param schema_filename: YAML schema file used for validation
    :param output_filename: path the transformed layer is written to
    :raises ValueError: if ``source`` is unknown, no suitable layer is found,
        or the geoboundaries archive does not contain exactly one geojson
    """
    config = parse_yaml('config.yml')
    if source == "cod":
        layerlist = fiona.listlayers(f'zip://{input_filename}')
        search = 'adm1'
        adm1_name = None
        # Pick the 'adm1' layer containing MultiPolygon features (the last
        # matching layer wins, as in the original scan).
        for layername in layerlist:
            if search in layername:
                with fiona.open(f'zip://{input_filename}', layer=layername) as layer:
                    for feature in layer:
                        if feature['geometry']['type'] == 'MultiPolygon':
                            adm1_name = layername
        if adm1_name is None:
            # Previously this crashed later with an opaque NameError.
            raise ValueError(
                f'No adm1 MultiPolygon layer found in {input_filename}')
        # The original recovered the name via layerlist.index() round-trip,
        # which is an identity no-op; use the matched name directly.
        df_adm1 = gpd.read_file(f'zip://{input_filename}', layer=adm1_name)
        schema_mapping = {
            'admin1Name_en': 'name_en'
        }
    elif source == "gadm":
        df_adm1 = gpd.read_file(
            f'zip://{input_filename}!{GADM_FILENAME.format(ISO3=config["constants"]["ISO3"])}',
            layer=GADM_LAYER.format(ISO3=config['constants']['ISO3']))
        schema_mapping = {
            'NAME_1': 'name_en',
            'GID_1': 'pcode',
            'GID_0': 'par_pcode'
        }
    elif source == "geoboundaries":
        rawdir = config['dirs']['raw_data']
        source_geob = os.path.join(rawdir, config['geoboundaries']['adm1']['raw'])
        unzipped, ext = os.path.splitext(source_geob)
        # Unzip
        geobndzip = zipfile.ZipFile(source_geob, 'r')
        geobndzip.extractall(unzipped)
        geobndzip.close()
        # Find geojson
        geojson = []
        for root, dirs, files in os.walk(unzipped):
            for filename in files:
                if filename.endswith(".geojson"):
                    geojson.append(os.path.join(root, filename))
        # The original only printed a warning for 0 or >1 matches and then
        # crashed with a NameError on df_adm1; raise a clear error instead.
        if len(geojson) > 1:
            raise ValueError(
                'Found more than one geojson file in {0}'.format(unzipped))
        if len(geojson) == 0:
            raise ValueError('Found no geojson files in {0}'.format(unzipped))
        df_adm1 = gpd.read_file(geojson[0])
        schema_mapping = {'shapeName': 'name_en'}
    else:
        raise ValueError(
            f'Unknown source "{source}": expected "cod", "gadm" or "geoboundaries"')
    # Change CRS
    df_adm1 = df_adm1.to_crs(config['constants']['crs'])
    # Modify the column names to suit the schema
    df_adm1 = df_adm1.rename(columns=schema_mapping)
    # Make columns needed for validation
    df_adm1['geometry_type'] = df_adm1['geometry'].apply(lambda x: x.geom_type)
    df_adm1['crs'] = df_adm1.crs
    # Validate
    validate(instance=df_adm1.to_dict('list'), schema=parse_yaml(schema_filename))
    # Write to output
    df_adm1.to_file(output_filename)
def adm_to_line(inputDir: str, schemaFile: str, iso3: str, supplier: str):
    """
    Derive internal dividing lines between adjacent admin polygons.

    For each admin polygon shapefile matching the supplier/ISO3 naming
    pattern under ``inputDir``, compute the shared borders between areas,
    build one line feature per border pair, validate against ``schemaFile``
    and write the result next to the input (``_py_s0_`` -> ``_ln_s0_``).

    :param inputDir: directory scanned (via get_files) for admin shapefiles
    :param schemaFile: YAML schema used for validation
    :param iso3: country ISO3 code used in the filename pattern
    :param supplier: data supplier tag used in the filename pattern; must be
        provided or the function prints usage help and exits
    """
    #config = parse_yaml(r'J:\git\datasources-etl\config.yml')
    # config = parse_yaml('config.yml') # relative to the root path, where
    # # snakemake is.
    # Ensure user knows to pass 'supplier' param
    if supplier == 'None' or supplier is None:
        print('Please provide Supplier param')
        print("snakemake transform_internal_boundaries --config supplier='<supplier> --cores 1'")
        sys.exit(0)
    else:
        print(f'Processing {supplier} admin boundaries for internal dividing lines.')
        print('! NOTE. There are currently known issues with this process.')
        print('! Please check the output manually for consistency.')
        print('! More details are in this wiki page:')
        print('! https://wiki.mapaction.org/display/orgdev/Boundaries')
        print('! See Section: Internal Boundaries')
        print('\n')
    # capture all relevant admin files
    files = get_files(inputDir, '.shp')
    admShps = []
    for fName in files:
        # match e.g. <iso3>_admn_ad1_py_s0_<supplier>_pp.shp, any admin level
        if re.match(
                rf".*{iso3.lower()}_admn_ad[1-9]_py_s[0-9]_{supplier}_pp.shp$",
                fName, re.I):
            admShps.append(fName)
    for inputFile in admShps:
        # output is the line (_ln_) equivalent of the polygon (_py_) input
        outputFile = inputFile.replace('_py_s0_', '_ln_s0_')
        df_adm = gpd.read_file(inputFile, encoding='utf-8')
        df_borders = df_adm.copy()
        # Note. The following could be within lamdba values. Unpacking into
        # separate function if further refinement of the changes are necessary.
        df_borders['borders'] = df_borders.apply(
            check_borders, df_borders=df_borders, border_name='name_en', axis=1)
        df_borders['new_rows'] = df_borders.apply(make_new_rows, axis=1)
        # Combine the rows from each new_rows column into a new dataframe
        # Assumes the following simple data schema.
        # 'name_en_1','name_en_2' These reflect the names of the admin areas on
        # either side of the line processed.
        df_new = gpd.GeoDataFrame(columns=['name_1_en', 'name_2_en', 'geometry'])
        for _, row in df_borders.iterrows():
            # NOTE(review): DataFrame.append was removed in pandas 2.0 —
            # this loop needs porting to pd.concat if pandas is upgraded;
            # the exact shape of 'new_rows' should be confirmed first.
            df_new = df_new.append(row['new_rows'], ignore_index=True)
        # Update local names - assumes source schema includes appropriate
        # colums.
        # name_1_local
        df_new = pd.merge(
            df_new, df_adm[['name_en', 'name_local']],
            left_on='name_1_en', right_on='name_en', how='inner')
        df_new.rename(columns={'name_local': 'name_1_loc'}, inplace=True)
        df_new.drop(['name_en'], axis=1, inplace=True)
        # name_2_local
        df_new = pd.merge(
            df_new, df_adm[['name_en', 'name_local']],
            left_on='name_2_en', right_on='name_en', how='inner')
        df_new.rename(columns={'name_local': 'name_2_loc'}, inplace=True)
        df_new.drop(['name_en'], axis=1, inplace=True)
        # not interested in point intersections
        df_new = df_new[~df_new.geometry.type.isin(['Point', 'MultiPoint'])]
        # NOTE. The process below can result in errors. This may be due to the
        # order in which a MultiLineString is defined. Checking for errors
        # (ie. Using ArcGIS Desktop - Check / Repair Geometry)
        # Convert MultiLineString to just LineStrings
        # Issues where MultiLineString is not converted - as is the case with
        # YEM GADM files:
        # https://gis.stackexchange.com/questions/223447/weld-individual-line-segments-into-one-linestring-using-shapely
        df_new['geometry'] = df_new['geometry'].apply(
            lambda x: ops.linemerge(x) if x.geom_type == 'MultiLineString' else x)
        # not interested in point intersections
        df_new = df_new[~df_new.geometry.type.isin(['Point', 'MultiPoint'])]
        # now separate out MultiLineString features
        df_new = df_new.explode()
        # Define projection to be the same as the source - which should be
        # specified in the config.yml file.
        df_new.crs = df_borders.crs
        # Make additional columns needed for validation
        df_new['geometry_type'] = df_new['geometry'].apply(lambda x: x.geom_type)
        # Validate
        try:
            validate(instance=df_new.to_dict('list'), schema=parse_yaml(schemaFile))
        except Exception as err:
            # best-effort: report the validation failure but do not write output
            print(err)
        else:
            # Write to output
            df_new.to_file(outputFile, encoding='utf-8')
    print("Done.")