Example #1
0
def dbscan_reduce_vot(df, x='x', y='y'):
    """
    Reduce a set of points to one representative row per DBSCAN cluster.

    The coordinate columns are clustered with the parameters found in
    yml['dbscan_vot']['params']. For every cluster `get_centroid` picks a
    point, and the first row of `df` with exactly those coordinates is kept
    as the representative of the cluster. Points that DBSCAN labels as noise
    (-1) are not part of any cluster and do not appear in the result.

    args:
        df : dataframe holding the coordinate columns and a 'geometry' column
        x : name of the column with the x coordinate
        y : name of the column with the y coordinate
    returns:
        GeoDataFrame (crs: yml['crs']['crs']) with one row per cluster
    """
    start_time = time.time()
    # matrix of np arrays, ordered (y, x); use the column names passed in
    coords = df[[y, x]].values
    db = DBSCAN(**yml['dbscan_vot']['params']).fit(coords)

    cluster_labels = db.labels_
    # DBSCAN marks noise with label -1. Counting it as a cluster would make
    # range(num_clusters) run one past the highest real label and yield an
    # empty cluster.
    num_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)

    if num_clusters == 0:
        # everything is noise: nothing to represent
        rs = df.iloc[0:0]
    else:
        clusters = pd.Series(
            [coords[cluster_labels == n] for n in range(num_clusters)])

        # find point in each cluster closest to its centroid
        # NOTE(review): assumes get_centroid returns a (y, x) pair - confirm
        centermost_points = clusters.map(get_centroid)

        # unzip list of centermost points
        lats, lons = zip(*centermost_points)
        rep_points = pd.DataFrame({x: lons, y: lats})

        # look the representative coordinates up in the source frame so the
        # result carries all original columns
        rs = rep_points.apply(lambda row: df[
            (df[y] == row[y]) & (df[x] == row[x])].iloc[0],
                              axis=1)

    rs = gpd.GeoDataFrame(rs, geometry='geometry', crs=yml['crs']['crs'])

    logger.info(
        "Clustered {:,} verblijfsobjecten down to {:,} vot_clusters, for {:.2f}% compression in {:,.2f} sec."
        .format(len(df), len(rs), 100 * (1 - float(len(rs)) / len(df)),
                time.time() - start_time))

    return rs
Example #2
0
def get_stag_table65():
    """
    Load the staged `clusters65` table as a GeoDataFrame.

    The csv is read from yml['path']['data_stag_tables'] with every column as
    string. The 'geometry' and 'pnd_geom' columns are parsed from WKT, and
    'geometry' becomes the active geometry column.

    returns:
        GeoDataFrame with crs yml['crs']['crs']
    """
    csv_path = (yml['path']['data_stag_tables'] +
                yml['file_stag_tables']['clusters65'])
    df = pd.read_csv(csv_path, dtype=str)

    # both geometry columns are stored as WKT text
    for col in ('geometry', 'pnd_geom'):
        df[col] = df[col].apply(wkt.loads)
    df = gpd.GeoDataFrame(df, geometry='geometry', crs=yml['crs']['crs'])

    # report the CRS actually set on the frame rather than a free name
    logger.info("GeoDataFrame has shape: {} and crs: {}".format(
        df.shape, df.crs))

    return df
def create_distance_matrix_afval(df1,
                                 df2,
                                 fractie=str,
                                 buffer=int,
                                 include_nearest_point=None,
                                 n=int):
    """
    calculate distance matrix frames. See ../helper_functions/distance_matrix.py
    Make sure you feed geoPandas dataframes with a geometry column.

    For each of the first `n` rows of `df2` (after filtering on `fractie`),
    the rows of `df1` that lie within the buffered geometry are selected and
    passed row by row to `calculate_distance`.

    args:
        df1 : dataframe one containing geometry column (points)
        df2 : dataframe with clustered afvalcontainers
              containing geometry column (points) and a 'fractie' column
        fractie : value of df2['fractie'] to keep
        buffer: buffer in meters around the geometry column in df2
        include_nearest_point: Find nearest point, return corresponding value
              from the 'container_id' column. Caution very slow on big sets.
        n : number of iterations. set n=len(df2) to loop through full set
    returns:
        list with one frame per processed row of df2

    NOTE: the defaults of `fractie`, `buffer` and `n` are type objects that
    only hint at the expected type; pass real values for all three.
    """
    df2 = df2[df2['fractie'] == fractie].reset_index()
    df2['buffer'] = df2['geometry'].buffer(buffer)

    logger.info('Building dm (fractie: {}, buffer: {} with {} iterations' \
                .format(fractie, buffer, n))

    # the union of all container geometries is the same for every iteration,
    # so compute it once and only when it is needed
    geom_union = df2.unary_union if include_nearest_point else None

    stag_distance = []

    # after reset_index the labels of df2 equal the positions, so `i` can be
    # used to fetch the matching geometry
    for i, buffer_geom in enumerate(tqdm_notebook(list(df2['buffer'][:n]))):
        sub_df = df1.loc[(df1.geometry.within(buffer_geom)), :]
        sub_df = (sub_df.apply(calculate_distance,
                               dest_geom=df2['geometry'][i],
                               target_col='distance',
                               axis=1))
        print('Shape sub_df {} = {}'.format(i, sub_df.shape))

        if include_nearest_point:
            indices = (sub_df.apply(find_nearest_point,
                                    geom_union=geom_union,
                                    df1=sub_df,
                                    df2=df2,
                                    geom1_col='geometry',
                                    src_column='container_id',
                                    axis=1))

            sub_df = pd.concat([sub_df, indices.to_frame()], axis=1)

        # append exactly once per iteration, with or without nearest point
        stag_distance.append(sub_df)

    return stag_distance
Example #4
0
def get_distance_matrices(path, file):
    """
    load in the munged distance matrices resulting from function
    distance_matrix/deduplicate_distance_matrix_..

    The 'geometry' and 'geom_point' columns are parsed from WKT, the '_merge'
    indicator column is dropped and 'geometry' becomes the active geometry.

    args:
        path : folder of the csv, including the trailing separator
        file : file name of the csv
    returns:
        GeoDataFrame with crs yml['crs']['crs']
    """
    df = pd.read_csv(path + file)

    # both geometry columns are stored as WKT text
    for col in ('geometry', 'geom_point'):
        df[col] = df[col].apply(wkt.loads)

    df = df.drop('_merge', axis=1)
    df = gpd.GeoDataFrame(df, geometry='geometry', crs=yml['crs']['crs'])

    # report the CRS actually set on the frame rather than a free name
    logger.info("GeoDataFrame has shape: {} and crs: {}".format(
        df.shape, df.crs))

    return df
def get_available_layers_from_wfs(url_wfs):
    """
        Request the GetCapabilities document of a WFS service, log the text
        of the second child of every FeatureType element and return those
        texts in a list.
    """
    capabilities = requests.get(url_wfs,
                                params={
                                    "REQUEST": "GetCapabilities",
                                    "SERVICE": "WFS"
                                })
    tree_root = ET.fromstring(capabilities.text)

    feature_tag = '{http://www.opengis.net/wfs/2.0}FeatureType'
    found = []
    for feature_type in tree_root.iter(feature_tag):
        # the layer name is read from the second child element
        layer_title = feature_type[1].text
        logger.info("layername: " + layer_title)
        found.append(layer_title)

    return found
Example #6
0
def get_df2(path, file, plot=bool):
    """
    load dataframe with the second poi geometries
    currently choice from ah, oba

    The csv is read with every column as string, its hex WKB geometry is
    parsed, and the frame is spatially joined (inner, intersects) with the
    district layer from `wfs.get_sd_layer()`. A 'buffer' column is added:
    1000 around the geometry when `file` equals yml['file']['ah'], 2000 for
    any other file.

    args:
        path : folder of the csv, including the trailing separator
        file : file name of the csv
        plot : when truthy, plot districts, points and buffers.
               NOTE: the default is the `bool` type itself, which is truthy,
               so pass plot=False explicitly to skip plotting.
    returns:
        GeoDataFrame with a fresh 0..n-1 index
    """
    df = pd.read_csv(path + file, dtype=str)

    logger.info("Loading {} GeoDataFrame, with shape: {} and crs: {}".format(
        file, df.shape, crs))

    # rename is a no-op when there is no 'geom' column
    df = df.rename(columns={'geom': 'geometry'})

    df['geometry'] = df['geometry'].apply(lambda x: wkb.loads(x, hex=True))
    df = gpd.GeoDataFrame(df, crs=crs, geometry='geometry')

    # join stadsdelen on poi2
    std = wfs.get_sd_layer()
    df = gpd.sjoin(df, std, how='inner', op='intersects')
    logger.info(
        "Spatial join of {} GeoDataFrame and Amsterdam district layer. \
    Added columns : {}".format(file, std.columns.tolist()))

    buffer = 1000 if file == yml['file']['ah'] else 2000
    df['buffer'] = df['geometry'].buffer(buffer)

    # The inner join may have dropped or duplicated label 0, so take the
    # first row by position instead of by label.
    geom_kind = df.geometry.geom_type.iloc[0] if len(df) else None
    logger.info("created {} meter buffer around {} geometry".format(
        buffer, geom_kind))

    if plot:
        fig, ax = plt.subplots(figsize=[15, 7])
        ax = std.plot(ax=ax)
        df.plot(ax=ax, color='red', alpha=.5, marker='*')
        # reuse the buffers computed above instead of buffering again
        gpd.GeoSeries(df['buffer']).plot(ax=ax, color='yellow', alpha=.085)

    return df.reset_index(drop=True)
def get_layer_from_wfs(url_wfs, layer_name, crs, outputformat, retry_count=3):
    """
    Get layer from a wfs service.
    Args:
        1. url_wfs: full url of the WFS including https, excluding /?::
            https://map.data.amsterdam.nl/maps/gebieden
        2. layer_name: Title of the layer:: f.i. stadsdeel
        3. crs: coordinate system number, excluding EPSG::
            28992, 4326
        4. outputformat: leave empty to return standard GML (Geographic Markup language),
           otherwise: json, geojson, txt, shapezip
        5. retry_count: maximum number of requests that are sent
    Returns:
        The parsed json (dict) when outputformat is 'geojson' or 'json' and
        the request succeeded with status 200. In every other case the last
        `requests` response object is returned unparsed, so the caller can
        inspect its status. None when retry_count is smaller than 1.
    """

    parameters = {
        "REQUEST": "GetFeature",
        "TYPENAME": layer_name,
        "SERVICE": "WFS",
        "VERSION": "2.0.0",
        "SRSNAME": "EPSG:{}".format(crs),
        "OUTPUTFORMAT": outputformat
    }

    logger.info("Requesting data from {}, layer: {}".format(
        url_wfs, layer_name))

    response = None

    # webrequests sometimes fail..
    for attempt in range(retry_count):
        response = requests.get(url_wfs, params=parameters)
        logger.debug(response)
        if response.status_code == 200:
            # status 200. Yeah!.
            break
        if response.status_code == 400:
            # the request itself is wrong; repeating it cannot help, and
            # looping here without counting the attempt would never end
            logger.info("Incorrect layer name: {}, please correct the layer name".format(layer_name))
            break
        if attempt < retry_count - 1:
            # wait a moment before trying again..
            time.sleep(3)

    succeeded = response is not None and response.status_code == 200

    # compare against the two format names; a substring test on one string
    # would also match an empty outputformat
    if succeeded and outputformat in ('geojson', 'json'):
        geojson = response.json()
        logger.info("{} features returned.".format(str(len(geojson["features"]))))
        return geojson

    return response
Example #8
0
def _merge_brp_flag(df, brp_file, flag):
    """
    Left-join a BRP csv on the bag frame and add a constant flag column.

    args:
        df : bag frame with a 'landelijk_vot_id' column
        brp_file : file name inside yml['path']['data_path_brp']
        flag : number stored in the new column; the column is named str(flag)
    returns:
        the merged frame, including the '_merge' indicator column
    """
    flag_col = str(flag)
    brp = pd.read_csv(yml['path']['data_path_brp'] + brp_file,
                      sep=';',
                      dtype=str)
    brp[flag_col] = flag
    merged = pd.merge(df,
                      brp[['lv_bag_vot_id', flag_col]],
                      left_on=['landelijk_vot_id'],
                      right_on=['lv_bag_vot_id'],
                      how='left',
                      indicator=True)

    # value_counts() has a categorical index; take the first (largest) count
    # by position explicitly instead of relying on integer [] falling back
    # to positional access
    logger.info("Matched {} rows with left_only join".format(
        merged._merge.value_counts().iloc[0]))

    return merged


def get_df1(path, bag_file, add_brp_18=None, add_brp_65=None, plot=None):
    """
    load bag data with or w/o 18/65 additional info

    The csv is read with every column as string, 'cl_geom' (hex WKB) is
    parsed into the active 'geometry' column and then dropped. For the full
    bag file 'pnd_geom' is parsed from hex WKB as well.

    args:
        path = path to the data folder
        bag_file = bag_file. See yml['file'] for options
        add_brp_18 = when truthy and bag_file is yml['file']['bag_full'],
            left-join yml['file']['vot18'] and return immediately
        add_brp_65 = same for yml['file']['vot65']; only reached when the
            add_brp_18 branch did not return
        plot = when truthy, plot the first 1000 rows; skipped when one of
            the brp merges returned early
    returns:
        GeoDataFrame, extended with the brp flag and '_merge' columns when a
        brp merge was applied
    """
    # bag clusters
    df = pd.read_csv(path + bag_file, dtype=str)

    logger.info("Loading {} GeoDataFrame, with shape: {} and crs: {}".format(
        bag_file, df.shape, crs))

    df['geometry'] = df['cl_geom'].apply(lambda x: wkb.loads(x, hex=True))
    df = gpd.GeoDataFrame(df, crs=yml['crs']['crs'], geometry='geometry')
    df = df.drop('cl_geom', axis=1)

    is_full_bag = bag_file == yml['file']['bag_full']
    if is_full_bag:
        df['pnd_geom'] = df['pnd_geom'].apply(lambda x: wkb.loads(x, hex=True))

    if add_brp_18:
        if is_full_bag:
            return _merge_brp_flag(df, yml['file']['vot18'], 18)
        print('add_brp_18 not applicable to bag_clusters dataset')

    if add_brp_65:
        if is_full_bag:
            return _merge_brp_flag(df, yml['file']['vot65'], 65)
        print('add_brp_65 not applicable to bag_clusters dataset')

    if plot:
        n = 1000
        fig, ax = plt.subplots(figsize=[15, 7])
        logger.info("Plotting {} POINTS".format(n))
        df[:n].plot(ax=ax, color='blue', alpha=.5)

    return df
Example #9
0
def get_afvalcontainers_full_df(column_subset=None):
    """
    Download the afvalcontainers geojson and return it as a GeoDataFrame.

    Steps: request yml['afvalcontainers']['url'], flatten every feature into
    one record, keep the fracties listed in yml['afvalcontainers']['fracties'],
    turn the coordinates into Points, flatten the nested 'owner' dicts into
    columns, reproject to yml['crs']['crs'] and keep only the points inside a
    fixed x/y bounding box.

    args:
        column_subset : when truthy, keep only the columns
            'container_id', 'geometry' and 'fractie' (plus 'x' and 'y')
    returns:
        GeoDataFrame with a fresh 0..n-1 index
    """
    # load geojson
    response = requests.get(url=yml['afvalcontainers']['url'],
                            params=yml['afvalcontainers']['params'])
    data = response.json()

    # parse the json, give nice names: output column -> key in 'properties'
    property_map = {
        'active': 'active',
        'buurt_code': 'buurt_code',
        'container_type_id': 'container_type_id',
        'container_id': 'id',
        'id_number': 'id_number',
        'operate_date': 'operational_date',
        'owner': 'owner',
        'place_date': 'placing_date',
        'serial_number': 'serial_number',
        'stadsdeel': 'stadsdeel',
        'address': 'text',
        'fractie': 'waste_name',
        'fractie_type': 'waste_type',
    }

    results = []
    for item in data['features']:
        props = item['properties']
        record = {'geometry': item['geometry']['coordinates']}
        for column, key in property_map.items():
            record[column] = props[key]
        results.append(record)

    # naming the columns keeps them available when no feature was returned
    df = pd.DataFrame(results, columns=['geometry'] + list(property_map))

    # filter out messy fracties/ waste_types; reset the index so the
    # positional index of the flattened owner frame below lines up with it
    df = df[df['fractie'].isin(yml['afvalcontainers']['fracties'])]
    df = df.reset_index(drop=True)

    # convert geometry column to Points before building the GeoDataFrame, so
    # the crs is attached to real geometries instead of coordinate lists
    df['geometry'] = [Point(xy) for xy in df['geometry']]
    df = gpd.GeoDataFrame(df,
                          geometry='geometry',
                          crs=yml['crs']['crs_4326'])

    # flatten the 'owner' column, merge back on df, drop owner column
    owner = json_normalize(df['owner'].tolist())
    df = (pd.merge(df, owner, left_index=True,
                   right_index=True).drop(labels=['owner'], axis=1))

    if column_subset:
        keep_cols = ['container_id', 'geometry', 'fractie']
        df = df[keep_cols]

    # to crs 28992
    df = df.to_crs(crs=yml['crs']['crs'])

    # filter an annoying outlier: keep points inside the bounding box only
    df['x'] = df['geometry'].x
    df['y'] = df['geometry'].y
    df = df[((df.x >= 110000) & (df.x <= 135000) & (df.y >= 475000) &
             (df.y <= 494000))]

    df = df.reset_index(drop=True)
    logger.info("index has been reset")
    logger.info("Afvalcontainers_full df has shape: {} and crs: {}".format(
        df.shape, df.crs))

    return df
def deduplicate_distance_matrix_general(stag_distance, df1, buffer):
    """
    Munge the raw distance matrix into one aggregated row per cluster.

    The frames in `stag_distance` are stacked, aggregated per
    'cluster_toewijzing' with the module-level aggregation spec `f`, and
    left-joined back on `df1`. The distance columns are scaled by 1000,
    missing values are replaced by `buffer + 1`, and the result is rounded
    to int. A mean of 'distance_min' per 'landelijk_pnd_id' is added as
    'pnd_dist_mean' (formatted as text), and histograms of the distance
    columns are drawn.

    args:
        stag_distance: list of frames resulting from the
            create_distance_matrix_afval function
        df1 : same dataframe one containing geometry column (points) as fed
            to the create_distance_matrix_afval function
        buffer : buffer used to build the distance matrix; buffer + 1 marks
            rows without a distance
    returns:
        the final frame, with 'pnd_geom' renamed to 'geometry' and the
        original 'geometry' renamed to 'geom_point'
    """
    distance_cols = ['distance_min', 'distance_max', 'distance_mean']

    raw_dm = pd.concat(stag_distance, axis=0, sort=False)
    raw_dm.distance = raw_dm['distance'].astype(float)
    logger.info('Raw dm has shape:{}'.format(raw_dm.shape))

    # aggregate per cluster and flatten the two-level column names
    per_cluster = raw_dm.groupby(['cluster_toewijzing']).agg(f).reset_index()
    flat_names = []
    for top, sub in per_cluster.columns:
        flat_names.append(f'{top}' if sub == '' else f'{top}_{sub}')
    per_cluster.columns = flat_names

    base = df1[[
        'landelijk_pnd_id', 'pnd_geom', 'geometry', 'cluster_toewijzing'
    ]]
    dist_agg = pd.merge(base,
                        per_cluster,
                        on=['cluster_toewijzing'],
                        how='left',
                        indicator=True)
    logger.info('Aggregated dm has shape:{}'.format(dist_agg.shape))
    logger.info('merge results:\n{}'.format(dist_agg._merge.value_counts()))

    for col in dist_agg[distance_cols].columns:
        scaled = dist_agg[col].astype(float).multiply(1000)
        # rows without a match get a value just outside the buffer
        as_text = scaled.fillna(buffer + 1).map('{:.0f}'.format)
        dist_agg[col] = as_text.astype(int)
    logger.info('Filled values above the {} buffer boundaries with value: {}'\
               .format(buffer, buffer + 1))

    # we want to plot on pnd_geom, Geopandas only accepts 'geometry' for plotting, so:
    dist_agg = dist_agg.rename(columns={
        'geometry': 'geom_point',
        'pnd_geom': 'geometry'
    })

    # mean of the minimum distances per pand, stored as formatted text
    pnd_mean_dist = (dist_agg.groupby(['landelijk_pnd_id'])['distance_min']
                     .mean()
                     .to_frame()
                     .reset_index()
                     .rename(columns={'distance_min': 'pnd_dist_mean'}))
    pnd_mean_dist.pnd_dist_mean = pnd_mean_dist.pnd_dist_mean.map(
        '{:.0f}'.format)

    final = pd.merge(dist_agg,
                     pnd_mean_dist,
                     on=['landelijk_pnd_id'],
                     how='left')
    logger.info('Final dm has shape:{}'.format(final.shape))
    logger.info('columns dm {}'.format(final.columns.tolist()))

    # plot histograms, one axis per distance column
    fig, ax = plt.subplots(len(distance_cols), 1, figsize=[9, 6])

    print('histogram numerical distance columns: ')
    inside_buffer = final[final.distance_min <= buffer]
    for axis, col in zip(ax, distance_cols):
        inside_buffer[col].dropna().hist(bins=40, ax=axis)
        axis.set_title(col)
        plt.tight_layout()

    return final