Example #1
def transform(source: str, input_filename: str, schema_filename: str,
              output_filename: str):
    """
    :param source: "cod" or "gadm"
    """
    config = parse_yaml('config.yml')
    if source == "cod":
        df_adm0 = gpd.read_file(f'zip://{input_filename}')
        schema_mapping = {'admin0Name_en': 'name_en'}
    elif source == "gadm":
        df_adm0 = gpd.read_file(
            f'zip://{input_filename}!{GADM_FILENAME.format(ISO3=config["constants"]["ISO3"])}',
            layer=GADM_LAYER.format(ISO3=config['constants']['ISO3']))
        schema_mapping = {'NAME_0': 'name_en'}
    # Change CRS
    df_adm0 = df_adm0.to_crs(config['constants']['crs'])
    # Modify the column names to suit the schema
    df_adm0 = df_adm0.rename(columns=schema_mapping)
    # Make columns needed for validation
    df_adm0['geometry_type'] = df_adm0['geometry'].apply(lambda x: x.geom_type)
    df_adm0['crs'] = df_adm0.crs
    # Validate
    validate(instance=df_adm0.to_dict('list'),
             schema=parse_yaml(schema_filename))
    # Write to output
    df_adm0.to_file(output_filename)
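A minimal invocation sketch for the ADM0 transform above. The filenames are hypothetical, and parse_yaml, validate, GADM_FILENAME and GADM_LAYER are assumed to be defined elsewhere in the module.
if __name__ == '__main__':
    # Hypothetical paths - substitute the real config-driven values.
    transform(source='gadm',
              input_filename='raw_data/gadm_levels.zip',
              schema_filename='schemas/admin0.yml',
              output_filename='processed_data/adm0.shp')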
Example #2
def extract_osm_query():
    osm_url = sys.argv[1]  # e.g. "http://overpass-api.de/api/interpreter?"
    country = sys.argv[2]  # e.g. 'YE'
    osm_schema = parse_yaml(sys.argv[3])  # e.g. parse_yaml('schemas/osm_tags_lakes.yml')
    geom_type = osm_schema['geom_type']
    osm_output_file = sys.argv[4]  # e.g. 'raw_data/osm_rivers_pol.xml'
    gpkg_output_file = sys.argv[5]  # e.g. 'raw_data/osm_rivers_pol.gpkg'
    get_osm_xml(osm_url, osm_query(osm_schema, country), osm_output_file)
    convert_osm2gpkg(osm_output_file, gpkg_output_file, geom_type)
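A hypothetical way to wire the extractor up as a script; the module name and argument values are illustrative only and mirror the inline comments above.
if __name__ == '__main__':
    # Expected argv: osm_url, country, schema YAML, XML output, GPKG output, e.g.
    # python extract_osm.py "http://overpass-api.de/api/interpreter?" YE \
    #     schemas/osm_tags_lakes.yml raw_data/osm_rivers_pol.xml raw_data/osm_rivers_pol.gpkg
    extract_osm_query()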
Example #3
def transform():
    config = parse_yaml('config.yml')
    held_gpkg = os.path.join(config['dirs']['raw_data'],
                             config['surrounding']['gadm']['raw'])
    country_aoi = config['constants']['ISO3']
    schema_filename = os.path.join(config['dirs']['schemas'],
                                   config['surrounding']['schema'])
    output_filename = os.path.join(config['dirs']['processed_data'],
                                   config['surrounding']['gadm']['processed'])
    # Unzip - as reading zipped world geopackage takes too long
    rawdir = config['dirs']['raw_data']
    zipgpkg = config['surrounding']['gadm']['rawzip']
    source_gadm_world = os.path.join(rawdir, zipgpkg)
    print(r'Unzipping {0} to {1}'.format(source_gadm_world, held_gpkg))
    gadmzip = zipfile.ZipFile(source_gadm_world, 'r')
    gadmzip.extractall(rawdir)
    gadmzip.close()
    # Check unzip was ok?
    print(f'Reading {held_gpkg}')
    for layername in fiona.listlayers(held_gpkg):
        print(f'Reading {layername} into Geopandas. Takes about 2 mins...')
        print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        # Note: if the geopackage holds more than one layer, only the last
        # layer read here is carried forward below.
        gdf = gpd.read_file(held_gpkg, layer=layername)
        print('Done reading.')
        print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    # Process the country of interest
    gdf_aoi_levels = gadm_gpkg_processing.get_country_admin_levels(
        gdf, country_aoi)
    # Get neighbours
    neighbours = gadm_gpkg_processing.get_neighbour_countries(
        gdf, gdf_A0=gdf_aoi_levels['a0'])
    # Remove country of interest from neighbours list if required.
    #if country_aoi in neighbours: neighbours.remove(country_aoi)
    print(r'Found {0}'.format(','.join(neighbours)))
    # Process all neighbour countries
    A0_list = []
    for neighbour in neighbours:
        gdf_aoi_levels = gadm_gpkg_processing.get_country_admin_levels(
            gdf, neighbour)
        A0_list.append(gdf_aoi_levels['a0'])
    # Concatenate neighbours' A0 geometries into a single GeoDataFrame
    gdf_A0_all = gpd.GeoDataFrame(pd.concat(A0_list, ignore_index=True))
    # Define the CRS of the concatenated layer (GADM data is supplied in WGS84)
    gdf_A0_all.crs = 'EPSG:4326'
    # Reproject
    gdf_A0_all = gdf_A0_all.to_crs(config['constants']['crs'])
    # Apply schema
    gdf_A0_all = gdf_A0_all.rename(columns={'NAME_0': 'name_en'})
    # Make columns needed for validation
    gdf_A0_all['geometry_type'] = gdf_A0_all['geometry'].apply(
        lambda x: x.geom_type)
    gdf_A0_all['crs'] = gdf_A0_all.crs
    # Validate
    validate(instance=gdf_A0_all.to_dict('list'),
             schema=parse_yaml(schema_filename))
    # Write to output
    gdf_A0_all.to_file(output_filename)
def transform_geoboundaries():
    config = parse_yaml('config.yml')
    rawdir = os.path.join(config['dirs']['raw_data'],
                          config['geoboundaries']['subfolder'])
    iso = config['constants']['ISO3']
    isofield = config['geoboundaries']['isofield']
    schema_filename = os.path.join(
            config['dirs']['schemas'], config['surrounding']['schema'])
    output_filename = os.path.join(
            config['dirs']['processed_data'], 
            config['surrounding']['geoboundaries']['processed'])
    dfs = []
    for root, dirs, files in os.walk(rawdir):
        for fileName in files:
            if fileName.endswith(".zip"):
                # Unzip
                forUnzip, ext = os.path.splitext(fileName)
                source = os.path.join(root, fileName)
                print(f'Processing {source}')
                unzipped = os.path.join(root, forUnzip)
                try:
                    # Use a context manager so the zip file is always closed
                    with zipfile.ZipFile(source, 'r') as geobndzip:
                        geobndzip.extractall(unzipped)
                except Exception as err:
                    print(err)
                    continue
                # Capture GeoJson
                for fName in os.listdir(unzipped):
                    if fName.endswith(".geojson"):
                        df = gpd.read_file(os.path.join(unzipped,fName))
                        dfs.append(df)
                # Delete dir
                shutil.rmtree(unzipped)    
    # Merge into single total GeoBoundaries ADM0 dataset
    gdf_source = gpd.GeoDataFrame(pd.concat(dfs, ignore_index=True))
    # get AOI from this dataset
    gdf_aoi = gdf_source.loc[gdf_source[isofield] == iso]
    # Check + concat + process neighbours
    neighbours = get_neighbours_generic(gdf_source, gdf_aoi, isofield)
    dfs_ngb = []
    for ngb in neighbours:
        dfs_ngb.append(gdf_source.loc[gdf_source[isofield] == ngb])
    gdf_ngb = gpd.GeoDataFrame(pd.concat(dfs_ngb, ignore_index=True))
    # Define the CRS of the concatenated layer (geoBoundaries GeoJSON is WGS84)
    gdf_ngb.crs = 'EPSG:4326'
    # Reproject
    gdf_ngb = gdf_ngb.to_crs(config['constants']['crs'])
    # Apply schema
    gdf_ngb = gdf_ngb.rename(columns={'shapeName': 'name_en'})
    # Make columns needed for validation
    gdf_ngb['geometry_type'] = gdf_ngb['geometry'].apply(lambda x: x.geom_type)
    gdf_ngb['crs'] = gdf_ngb.crs
    validate(
        instance=gdf_ngb.to_dict('list'), schema=parse_yaml(schema_filename))
    gdf_ngb.to_file(output_filename)
Example #5
def transform(source: str, input_filename: str, schema_filename: str, output_filename: str):
    """
    :param source: "cod" or "osm"
    """
    print(source, input_filename, schema_filename, output_filename)
    config = parse_yaml('config.yml')

    if source == "osm":

        df_roads = gpd.read_file(input_filename)
        # df_roads = convert_osm_to_gpkg(input_filename, 'osm_roads.gpkg', 'lines')
        schema_mapping = {
            'name:en': 'name_en',
            'name': 'name_loc',
            'highway': 'fclass'
            }
        # GDAL converts OSM to GPKG; tags are written as hstore key-value pairs
        # in the 'other_tags' attribute. Use SQLAlchemy's HSTORE result
        # processor to convert each hstore string to a dictionary.
        hstore = HSTORE.result_processor(None, None, 'string')
        df_roads['other_tags'] = df_roads['other_tags'].apply(hstore)
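        # For example (assuming the usual hstore text format), a value such as
        # '"surface"=>"asphalt","oneway"=>"yes"' becomes
        # {'surface': 'asphalt', 'oneway': 'yes'} after this conversion.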

        for key, value in schema_mapping.items():
            # Use a temporary single-entry dict for the pandas rename method
            # (rather than the full mapping) so we can tell whether each input
            # attribute is present.
            temp_schema_dict = {key: value}
            try:
                # Rename the column if it exists.
                df_roads = df_roads.rename(columns=temp_schema_dict, errors="raise")
            except KeyError:
                # The input attribute is not present. If the output attribute
                # is also missing, pull the value from 'other_tags'.
                if value not in df_roads.columns:
                    df_roads[value] = df_roads['other_tags'].apply(
                        lambda x: x.get(key) if isinstance(x, dict) else x)

        # now remove columns which aren't in schema:
        schema_to_keep = list(schema_mapping.values())
        # add geometry to schema
        schema_to_keep.append('geometry')
        df_roads = df_roads.filter(schema_to_keep)


    elif source == "cod":
        df_roads = gpd.read_file(f'zip://{input_filename}')

        # COD data has some NAs
        df_roads = df_roads[df_roads['geometry'].notna()]
        schema_mapping = {'TYPE': 'fclass'}
        # Rename columns using schema_mapping
        df_roads = df_roads.rename(columns=schema_mapping)

    # TODO: need to convert from XML to GPKG rather than OSM to GPKG
    # Change CRS
    df_roads = df_roads.to_crs(config['constants']['crs'])
    # Make columns needed for validation
    ### df_roads['geometry_type'] = df_roads['geometry'].apply(lambda x: x.geom_type)
    ### df_roads['crs'] = df_roads.crs
    # Validate
    ### validate(instance=df_roads.to_dict('list'), schema=parse_yaml(schema_filename))
    # Write to output
    df_roads.to_file(output_filename, encoding='utf-8')
def get_all_adm0(previous=None):
    config = parse_yaml('config.yml')
    url = "{0}?{1}".format(config['geoboundaries']['url'],
                           config['geoboundaries']['all'])
    rawdir = os.path.join(config['dirs']['raw_data'],
                          config['geoboundaries']['subfolder'])
    Path(rawdir).mkdir(parents=True, exist_ok=True)
    geodb_api_response = get_json(url)
    df = pd.DataFrame(geodb_api_response)
    if previous:
        # Load the previous scan/download 
        # Check the date - if previous is available
        pass  
    else:
        # Use latest CSV with format 'gbd_YYYYMMDD.csv' 
        # Process the DataFrame to remove all dates that are up to date.
        # If no latest CSV found download the whole lot
        for index, row in df.iterrows():
            # Download the zip (to a GeoBoundaries ADM0 folder)
            # Unzip is part of the transform process. 
            # Use transform/adm0.py ? 
            dl = row['downloadURL']
            extra, fName = os.path.split(row['downloadURL'])
            outpath = os.path.join(rawdir, fName)
            print(r'Downloading {0} to {1}'.format(dl, outpath))
            download_url(dl, outpath)
        # Log the output details
        now = datetime.datetime.now().strftime("%Y%m%d")
        df.to_csv(os.path.join(rawdir, "gbd_{0}.csv".format(now)),
                  index=False)
def get_geoboundaries_adm(admLevel=None):
    config = parse_yaml('config.yml')
    rooturl = config['geoboundaries']['url']
    iso = config['constants']['ISO3']
    typ = config['geoboundaries']['typ']
    # Get Levels
    url = "{0}?ISO={1}&{2}".format(rooturl, iso, typ)
    knownlevels = len(get_json(url))
    # Process Data
    if admLevel is not None: url = r"{0}&ADM={1}".format(url, admLevel)
    # We want all available admin boundaries so skip specifying which AMD to
    # process. Otherwise include a specific ADM level to process.
    geob_api_response = get_json(url)
    # Check whether there may be additional admin levels available 
    if len(geob_api_response) > knownlevels and admLevel is None:
        print('Note. Additional Admin levels may have been added.')
        # Can check the JSON for all admin level on the fly.
        print('\tConfirm with {0}'.format(url))
        print('\tCheck output dir {0}'.format(config['dirs']['raw_data']))
    # Get the download urls from the api response
    for level in geob_api_response:
        adm = level[config['geoboundaries']['boundaryType_api_key']].lower()
        rawdir = config['dirs']['raw_data']
        raw = r'{0}{1}.zip'.format(config['geoboundaries']['raw'], adm)
        dl = level[config['geoboundaries']['downloadURL_api_key']]
        outpath = os.path.join(rawdir, raw)
        Path(rawdir).mkdir(parents=True, exist_ok=True)
        print(r'Downloading {0} to {1}'.format(dl, outpath))
        download_url(dl, outpath) 
Example #8
def get_world():
    config = parse_yaml('config.yml')
    rawdir = config['dirs']['raw_data']
    outputZip = config['surrounding']['gadm']['rawzip']
    sourceURL = config['surrounding']['gadm']['url']
    source_gadm_world = os.path.join(rawdir, outputZip)
    print(r'Downloading {0} to {1}'.format(sourceURL, source_gadm_world))
    download_url(sourceURL, source_gadm_world)
Example #9
def transform(source: str, input_filename: str, schema_filename: str,
              output_filename: str):
    """
    :param source: "cod" or "gadm"
    """
    config = parse_yaml('config.yml')

    if source == "cod":
        layerlist = fiona.listlayers(f'zip://{input_filename}')
        # Find the adm3 layer (the one containing MultiPolygon features)
        search = 'adm3'
        for layername in layerlist:
            if search in layername:
                with fiona.open(f'zip://{input_filename}',
                                layer=layername) as layer:
                    for feature in layer:
                        if feature['geometry']['type'] == 'MultiPolygon':
                            adm3_name = layername
                            break

        df_adm3 = gpd.read_file(f'zip://{input_filename}', layer=adm3_name)
        schema_mapping = {'admin3Name_en': 'name_en'}
    elif source == "gadm":
        df_adm3 = gpd.read_file(
            f'zip://{input_filename}!{GADM_FILENAME.format(ISO3=config["constants"]["ISO3"])}',
            layer=GADM_LAYER.format(ISO3=config['constants']['ISO3']))
        schema_mapping = {'NAME_3': 'name_en'}
    # Change CRS
    df_adm3 = df_adm3.to_crs(config['constants']['crs'])
    # Modify the column names to suit the schema
    df_adm3 = df_adm3.rename(columns=schema_mapping)
    # Make columns needed for validation
    df_adm3['geometry_type'] = df_adm3['geometry'].apply(lambda x: x.geom_type)
    df_adm3['crs'] = df_adm3.crs
    # Validate
    validate(instance=df_adm3.to_dict('list'),
             schema=parse_yaml(schema_filename))
    # Write to output
    df_adm3.to_file(output_filename)
Example #10
def transform(source: str, input_filename: str, schema_filename: str,
              output_filename: str):
    """
    :param source: "cod" or "gadm"
    """
    config = parse_yaml('config.yml')

    if source == "cod":
        df_seaports = gpd.read_file(f'zip://{input_filename}')

    # Change CRS
    df_seaports = df_seaports[df_seaports['geometry'].notna()]
    df_seaports = df_seaports.to_crs(config['constants']['crs'])
    # Make columns needed for validation
    df_seaports['geometry_type'] = df_seaports['geometry'].apply(
        lambda x: x.geom_type)
    df_seaports['crs'] = df_seaports.crs

    # Validate
    validate(instance=df_seaports.to_dict('list'),
             schema=parse_yaml(schema_filename))

    # Write to output
    df_seaports.to_file(output_filename)
Example #11
def transform(source: str, input_filename: str, schema_filename: str, output_filename: str):
    """
    :param source: "cod" or "gadm"
    """
    config = parse_yaml('config.yml')

    if source == "cod":
        layerlist = fiona.listlayers(f'zip://{input_filename}')
        # Find the adm1 layer (the one containing MultiPolygon features)
        search = 'adm1'
        for layername in layerlist:
            if search in layername:
                with fiona.open(f'zip://{input_filename}', layer=layername) as layer:
                    for feature in layer:
                        if feature['geometry']['type'] == 'MultiPolygon':
                            adm1_name = layername
                            break

        df_adm1 = gpd.read_file(f'zip://{input_filename}', layer=adm1_name)
        schema_mapping = {
            'admin1Name_en': 'name_en'
        }
    elif source == "gadm":
        df_adm1 = gpd.read_file(f'zip://{input_filename}!{GADM_FILENAME.format(ISO3=config["constants"]["ISO3"])}',
                                layer=GADM_LAYER.format(ISO3=config['constants']['ISO3']))
        schema_mapping = {
            'NAME_1': 'name_en',
            'GID_1': 'pcode',
            'GID_0': 'par_pcode'
        }
    elif source == "geoboundaries":
        rawdir = config['dirs']['raw_data']
        source_geob = os.path.join(rawdir, config['geoboundaries']['adm1']['raw'])
        unzipped, ext = os.path.splitext(source_geob)
        # Unzip
        geobndzip = zipfile.ZipFile(source_geob, 'r')
        geobndzip.extractall(unzipped)
        geobndzip.close()
        # Find geojson
        geojson = []
        for root, dirs, files in os.walk(unzipped):
            for filename in files:
                if filename.endswith(".geojson"):
                    geojson.append(os.path.join(root, filename))
        if len(geojson) > 1:
            print('Found more than one geojson file in {0}'.format(unzipped))
        elif len(geojson) == 0:
            print('Found no geojson files in {0}'.format(unzipped))
        else:
            df_adm1 = gpd.read_file(geojson[0])
        schema_mapping = {'shapeName': 'name_en'}
    # Change CRS
    df_adm1 = df_adm1.to_crs(config['constants']['crs'])
    # Modify the column names to suit the schema
    df_adm1 = df_adm1.rename(columns=schema_mapping)
    # Make columns needed for validation
    df_adm1['geometry_type'] = df_adm1['geometry'].apply(lambda x: x.geom_type)
    df_adm1['crs'] = df_adm1.crs
    # Validate
    validate(instance=df_adm1.to_dict('list'), schema=parse_yaml(schema_filename))
    # Write to output
    df_adm1.to_file(output_filename)
Example #12
def adm_to_line(inputDir: str, schemaFile: str, iso3: str, supplier: str):
    #config = parse_yaml(r'J:\git\datasources-etl\config.yml')
    # config = parse_yaml('config.yml') # relative to the root path, where
    #                                   # snakemake is.

    # Ensure user knows to pass 'supplier' param
    if supplier == 'None' or supplier is None:
        print("Please provide the 'supplier' parameter, e.g.:")
        print("snakemake transform_internal_boundaries --config supplier='<supplier>' --cores 1")
        sys.exit(0)
    else:
        print(f'Processing {supplier} admin boundaries for internal dividing lines.')

    print('! NOTE. There are currently known issues with this process.')
    print('! Please check the output manually for consistency.')
    print('! More details are in this wiki page:')
    print('! https://wiki.mapaction.org/display/orgdev/Boundaries')
    print('! See Section: Internal Boundaries')
    print('\n')

    # capture all relevant admin files
    files = get_files(inputDir, '.shp')
    admShps = []
    for fName in files:
        if re.match(
                rf".*{iso3.lower()}_admn_ad[1-9]_py_s[0-9]_{supplier}_pp.shp$",
                fName, re.I):
            admShps.append(fName)

    for inputFile in admShps:
        outputFile = inputFile.replace('_py_s0_','_ln_s0_')
        df_adm = gpd.read_file(inputFile, encoding='utf-8')
        df_borders = df_adm.copy()
        # Note: the following could be done inline within lambda expressions.
        # Unpack into a separate function if further refinement is necessary.
        df_borders['borders'] = df_borders.apply(
                check_borders, df_borders=df_borders, border_name='name_en', axis=1)
        df_borders['new_rows'] = df_borders.apply(make_new_rows, axis=1)
        # Combine the rows from each new_rows column into a new dataframe.
        # Assumes the following simple data schema: 'name_1_en' and 'name_2_en'
        # hold the names of the admin areas on either side of the line processed.
        df_new = gpd.GeoDataFrame(columns=['name_1_en', 'name_2_en', 'geometry'])
        for _, row in df_borders.iterrows():
            # Note: DataFrame.append was removed in pandas 2.0; this assumes an
            # older pandas (use pd.concat there instead).
            df_new = df_new.append(row['new_rows'], ignore_index=True)
        
        # Update local names - assumes the source schema includes a
        # 'name_local' column.
        # name_1_loc
        df_new = pd.merge(
                df_new, df_adm[['name_en', 'name_local']], 
                left_on='name_1_en', right_on='name_en', how='inner')
        df_new.rename(columns={'name_local': 'name_1_loc'}, inplace=True)
        df_new.drop(['name_en'], axis=1, inplace=True)
        # name_2_loc
        df_new = pd.merge(
                df_new, df_adm[['name_en', 'name_local']],
                left_on='name_2_en', right_on='name_en', how='inner')
        df_new.rename(columns={'name_local': 'name_2_loc'}, inplace=True)
        df_new.drop(['name_en'], axis=1, inplace=True)

        # not interested in point intersections
        df_new = df_new[~df_new.geometry.type.isin(['Point', 'MultiPoint'])]

        # NOTE. The process below can result in errors. This may be due to the
        # order in which a MultiLineString is defined. Checking for errors
        # (ie. Using ArcGIS Desktop - Check / Repair Geometry)

        # Convert MultiLineString to just LineStrings
        # Issues where MultiLineString is not converted - as is the case with
        # YEM GADM files:
        # https://gis.stackexchange.com/questions/223447/weld-individual-line-segments-into-one-linestring-using-shapely
        df_new['geometry'] = df_new['geometry'].apply(
            lambda x: ops.linemerge(x)
            if x.geom_type == 'MultiLineString'
            else x)
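        # e.g. ops.linemerge(MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]))
        # returns LineString((0, 0), (1, 1), (2, 2)) when the parts are contiguous;
        # non-contiguous parts are left as a MultiLineString.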
        # not interested in point intersections
        df_new = df_new[~df_new.geometry.type.isin(['Point', 'MultiPoint'])]

        # now separate out MultiLineString features
        df_new = df_new.explode()

        # Define projection to be the same as the source - which should be
        # specified in the config.yml file. 
        df_new.crs = df_borders.crs
        # Make additional columns needed for validation
        df_new['geometry_type'] = df_new['geometry'].apply(lambda x: x.geom_type)
        # Validate
        try:
            validate(instance=df_new.to_dict('list'), schema=parse_yaml(schemaFile))
        except Exception as err:
            print(err)
        else:
            # Write to output
            df_new.to_file(outputFile, encoding='utf-8')
    print("Done.")