Code example #1
File: gabriel.py Project: pysal/libpysal
 def __init__(self, coordinates, **kwargs):
     try:
         from numba import njit
     except ModuleNotFoundError:
         warnings.warn(
             "The numba package is used extensively in this module"
             " to accelerate the computation of graphs. Without numba,"
             " these computations may become unduly slow on large data.")
     edges, _ = self._voronoi_edges(coordinates)
     voronoi_neighbors = pandas.DataFrame(edges).groupby(0)[1].apply(
         list).to_dict()
     W.__init__(self, voronoi_neighbors, **kwargs)
Code example #2
File: gabriel.py Project: pysal/libpysal
 def __init__(self, coordinates, **kwargs):
     try:
         from numba import njit
     except ModuleNotFoundError:
         warnings.warn(
             "The numba package is used extensively in this module"
             " to accelerate the computation of graphs. Without numba,"
             " these computations may become unduly slow on large data.")
     edges, dt = self._voronoi_edges(coordinates)
     droplist = _filter_gabriel(
         edges,
         dt.points,
     )
     output = set(map(tuple, edges)).difference(set(droplist))
     gabriel_neighbors = pandas.DataFrame(output).groupby(0)[1].apply(
         list).to_dict()
     W.__init__(self, gabriel_neighbors, **kwargs)
Code example #3
File: gabriel.py Project: pysal/libpysal
 def __init__(self, coordinates, binary=True, **kwargs):
     try:
         from numba import njit
     except ModuleNotFoundError:
         warnings.warn(
             "The numba package is used extensively in this module"
             " to accelerate the computation of graphs. Without numba,"
             " these computations may become unduly slow on large data.")
     edges, dt = self._voronoi_edges(coordinates)
     output, dkmax = _filter_relativehood(edges,
                                          dt.points,
                                          return_dkmax=False)
     row, col, data = zip(*output)
     if binary:
         data = numpy.ones_like(col, dtype=float)
     sp = sparse.csc_matrix(
         (data, (row, col)))  #TODO: faster way than this?
     tmp = WSP(sp).to_W()
     W.__init__(self, tmp.neighbors, tmp.weights, **kwargs)
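
A minimal usage sketch for the three constructors above. It assumes they belong to the Delaunay, Gabriel, and Relative_Neighborhood classes in libpysal.weights.gabriel; the class names are not shown in the snippets, so treat the imports as an assumption.

import numpy

from libpysal.weights.gabriel import Delaunay, Gabriel, Relative_Neighborhood

# random planar points as an (n, 2) coordinate array
coordinates = numpy.random.default_rng(0).uniform(0, 100, size=(50, 2))

delaunay = Delaunay(coordinates)               # every Delaunay edge becomes a link
gabriel = Gabriel(coordinates)                 # Delaunay edges that pass the Gabriel filter
relative = Relative_Neighborhood(coordinates)  # relative-neighbourhood subgraph

# Gabriel and relative-neighbourhood graphs are subgraphs of the Delaunay
# triangulation, so they never have more links (s0 = total link weight).
assert gabriel.s0 <= delaunay.s0
assert relative.s0 <= delaunay.s0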
Code example #4
File: greedy.py Project: rsmahabir/mapclassify
def _geos_sw(features, tolerance=0, silence_warnings=False, resolution=5):
    """
    Generate libpysal spatial weights object based on intersections of features.

    Intersecting features are denoted as neighbours. If tolerance > 0, all features
    within the set tolerance are denoted as neighbours.


    Parameters
    ----------
    features : GeoDataFrame
        GeoDataFrame
    tolerance : float (default 0)
        minimal distance between colors; features within this distance of each
        other are treated as neighbours
    silence_warnings : bool (default False)
        silence libpysal warnings (if min_distance is set)
    resolution : int (default 5)
        resolution of buffer if tolerance > 0

    Returns
    -------
    W : libpysal.weights.W
        spatial weights object
    """
    try:
        from libpysal.weights import W
    except ImportError:
        raise ImportError("The 'libpysal' package is required.")

    neighbors = {}

    if tolerance > 0:
        features = features.copy()
        features["geometry"] = features.geometry.buffer(
            tolerance / 2, resolution)

    sindex = features.sindex

    for i, (ix, g) in enumerate(features.geometry.items()):  # .iteritems() was removed in pandas 2.0

        possible_matches_index = list(sindex.intersection(g.bounds))
        possible_matches_index.remove(i)
        possible_matches = features.iloc[possible_matches_index]
        precise_matches = possible_matches.loc[possible_matches.intersects(g)]

        neighbors[ix] = list(precise_matches.index)

    return W(neighbors, silence_warnings=silence_warnings)
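
A hedged usage sketch on toy geometries; _geos_sw is a private helper, so this assumes the function above is in scope (e.g. imported from the module named in the header).

import geopandas as gpd
from shapely.geometry import box

# three unit squares: the first two share an edge, the third is isolated
gdf = gpd.GeoDataFrame(geometry=[box(0, 0, 1, 1), box(1, 0, 2, 1), box(5, 5, 6, 6)])

w = _geos_sw(gdf, silence_warnings=True)  # silence the island warning for feature 2
print(w.neighbors)  # expected: {0: [1], 1: [0], 2: []}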
Code example #5
def add_block_gini(bldg_pop: gpd.GeoDataFrame,
                   values_col: str,
                   weights: Optional[libpysal.weights.weights.W] = None
                   ) -> gpd.GeoDataFrame:
    """
    Calculates the Gini coefficient of values_col over each building's spatial
    weights neighbourhood (by default, all buildings in the same block) and
    appends it to bldg_pop as a new column.
    """
    if values_col not in bldg_pop.columns:
        raise IndexError(f'{values_col} was not found in bldg_pop')

    if weights is None:
        weights_dict = {
            bldg_pop.loc[index_val]['bldg_id']: bldg_pop[
                bldg_pop['block_id'] == bldg_pop.loc[index_val]['block_id']
            ]['bldg_id'].tolist()
            for index_val in bldg_pop.index.tolist()
        }
        weights = W(weights_dict, ids=bldg_pop['bldg_id'].tolist())

    bldg_pop['gini_'+values_col] = Gini(bldg_pop, values=values_col, spatial_weights=weights, unique_id='bldg_id').series

    return bldg_pop
Code example #6
File: abstracts.py Project: terratenney/bounder
    def set_connections(self, alist=None, w=None, nx=None, verify=True):
        """
        Set up the graph structure. It is kept under the hood as
        an adjacency list and a PySAL W object, so if a networkx object
        is supplied, it is converted.

        Only one argument is required to populate the connections. 
        """
        if all([nx is None, alist is None, w is None]):  # check the w argument, not the W class
            raise ValueError("At least one adjacency list, W, or networkx graph"
                             " must be provided to conduct clustering.")
        if nx is not None and all([alist is None, w is None]):
            w = W.from_networkx(nx)
        w, alist = utils.get_w_and_alist(alist=alist, W=w, 
                                           skip_verify=not verify)
        self.nx = nx if nx is not None else w.to_networkx()
        self.w = w
        self.alist = alist
Code example #7
def add_building_adjacency(bldg_pop: gpd.GeoDataFrame, 
                           weights: Optional[libpysal.weights.weights.W] = None
                           ) -> gpd.GeoDataFrame:
    """
    Calculates the building adjacency, roughly the ratio of the number of buildings to the number of "built-up patches".
    By default, the "spatial_weights_higher" matrix connects every building in the same block to every other building in that block.

    The intention is to figure out whether the block is composed of adjoining buildings or whether it's primarily freestanding
    buildings. 
    Defined here: https://www-sciencedirect-com.proxy.uchicago.edu/science/article/pii/S0169204617301275?via%3Dihub
    Documentation here: http://docs.momepy.org/en/stable/generated/momepy.BuildingAdjacency.html#momepy.BuildingAdjacency
    """
    if weights is None:
        weights_dict = {
            bldg_pop.loc[index_val]['bldg_id']: bldg_pop[
                bldg_pop['block_id'] == bldg_pop.loc[index_val]['block_id']
            ]['bldg_id'].tolist()
            for index_val in bldg_pop.index.tolist()
        }
        weights = W(weights_dict, ids=bldg_pop['bldg_id'].tolist())

    bldg_pop['building_adjacency'] = BuildingAdjacency(bldg_pop, spatial_weights_higher=weights, unique_id='bldg_id').series

    return bldg_pop
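
The default weights in code examples #5 and #7 are built with a dict comprehension that filters the whole frame once per row. Below is a sketch of an equivalent, typically faster construction using a single groupby, under the same column-name assumptions (bldg_id, block_id).

from libpysal.weights import W

def _block_weights(bldg_pop):
    # map each block_id to the list of bldg_ids it contains
    block_members = bldg_pop.groupby('block_id')['bldg_id'].apply(list).to_dict()
    # every building neighbours every building in its block (itself included,
    # matching the comprehension used in the helpers above)
    neighbors = {
        bldg_id: block_members[block_id]
        for bldg_id, block_id in zip(bldg_pop['bldg_id'], bldg_pop['block_id'])
    }
    return W(neighbors, ids=bldg_pop['bldg_id'].tolist())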
Code example #8
def _geos_sw(features, tolerance=0, silence_warnings=False):
    """
    Generate libpysal spatial weights object based on intersections of features.

    Intersecting features are denoted as neighbours. If tolerance > 0, all features
    within the set tolerance are denoted as neighbours.


    Parameters
    ----------
    features : GeoDataFrame
        GeoDataFrame
    tolerance : float (default 0)
        minimal distance between colors; features within this distance of each
        other are treated as neighbours
    silence_warnings : bool (default False)
        silence libpysal warnings (if min_distance is set)

    Returns
    -------
    W : libpysal.weights.W
        spatial weights object
    """
    neighbors = {}

    if tolerance > 0:
        features = features.copy()
        features["geometry"] = features.geometry.buffer(tolerance / 2, 5)

    sindex = features.sindex

    for i, (ix, f) in enumerate(features.iterrows()):

        g = f.geometry

        possible_matches_index = list(sindex.intersection(g.bounds))
        possible_matches_index.remove(i)
        possible_matches = features.iloc[possible_matches_index]
        precise_matches = possible_matches.loc[possible_matches.intersects(g)]

        neighbors[ix] = list(precise_matches.index)

    return W(neighbors, silence_warnings=silence_warnings)
Code example #9
def cluster(
    gdf,
    n_clusters=6,
    method=None,
    best_model=False,
    columns=None,
    verbose=False,
    time_var="year",
    id_var="geoid",
    scaler='std',
    pooling="fixed",
    w=False,
    **kwargs,
):
    """Create a geodemographic typology by running a cluster analysis on the study area's neighborhood attributes.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame, required
        long-form GeoDataFrame containing neighborhood attributes
    n_clusters : int, required
        the number of clusters to model (the default is 6).
    method : str in ['kmeans', 'ward', 'affinity_propagation', 'spectral','gaussian_mixture', 'hdbscan'], required
        the clustering algorithm used to identify neighborhood types
    best_model : bool, optional
        if using a gaussian mixture model, use BIC to choose the best
        n_clusters. (the default is False).
    columns : list-like, required
        subset of columns on which to apply the clustering
    verbose : bool, optional
        whether to print warning messages (the default is False).
    time_var : str, optional
        which column on the dataframe defines time and or sequencing of the
        long-form data. Default is "year"
    id_var : str, optional
        which column on the long-form dataframe identifies the stable units
        over time. In a wide-form dataset, this would be the unique index
    scaler : None or scaler from sklearn.preprocessing, optional
        a scikit-learn preprocessing class that will be used to rescale the
        data. Defaults to sklearn.preprocessing.StandardScaler
    pooling : ["fixed", "pooled", "unique"], optional (default='fixed')
        How to treat temporal data when applying scaling. Options include:

        * fixed : scaling is fixed to each time period
        * pooled : data are pooled across all time periods
        * unique : if scaling, apply the scaler to each time period, then generate
          clusters unique to each time period.

    Returns
    -------
    gdf : geopandas.GeoDataFrame
        GeoDataFrame with a column of neighborhood cluster labels
        appended as a new column. If cluster method exists as a column on the DataFrame
        then the column will be incremented.

    model : named tuple
        A tuple with attributes X, columns, labels, instance, W, which store the
        input matrix, column labels, cluster labels, fitted model instance, and
        spatial weights matrix

    model_name : str
        name of model to be stored in a Community

    """
    specification = {
        "ward": ward,
        "kmeans": kmeans,
        "affinity_propagation": affinity_propagation,
        "gaussian_mixture": gaussian_mixture,
        "spectral": spectral,
        "hdbscan": hdbscan,
        "spenc": spenc,
        "ward_spatial": ward_spatial,
        "skater": skater,
        "max_p": max_p
    }
    if scaler == "std":
        scaler = StandardScaler()
    if method not in specification.keys():
        raise ValueError(f"`method` must of one of {specification.keys()}")

    # if we already have a column named after the clustering method, then increment it.
    if method in gdf.columns.tolist():
        model_name = method + str(
            len(gdf.columns[gdf.columns.str.startswith(method)]))
    else:
        model_name = method
    if not columns:
        raise ValueError("You must provide a subset of columns as input")

    times = gdf[time_var].unique()

    gdf = gdf.set_index([time_var, id_var])

    # this is the dataset we'll operate on
    data = gdf.copy()[columns]
    data = data.dropna(how="any", subset=columns)

    if scaler:

        if pooling in ["fixed", "unique"]:
            # if fixed (or unique), scale within each time period
            for time in times:
                data.loc[time] = scaler.fit_transform(data.loc[time].values)

        elif pooling == "pooled":
            # if pooled, scale the whole series at once
            data.loc[:, columns] = scaler.fit_transform(data.values)

    # the rescalar can create nans if a column has no variance, so fill with 0
    data = data.fillna(0)

    if pooling != "unique":
        if w:
            d = gdf.copy()
            d[columns] = data[columns]
            d = d.reset_index()
            dfs = [d[d[time_var] == time] for time in times]
            data, w = pool_dfs(dfs)
            data = data.set_index([time_var, id_var])[columns]
            w = W(w)

        # run the cluster model then join the labels back to the original data
        model = specification[method](
            data,
            n_clusters=n_clusters,
            best_model=best_model,
            verbose=verbose,
            w=w,
            **kwargs,
        )
        labels = model.labels_.astype(str)
        data = data.reset_index()
        clusters = pd.DataFrame({
            model_name: labels,
            time_var: data[time_var],
            id_var: data[id_var]
        })
        clusters.set_index([time_var, id_var], inplace=True)
        clusters = clusters[~clusters.index.duplicated(keep='first')]
        gdf = gdf.join(clusters, how="left")
        gdf = gdf.reset_index()
        results = ModelResults(X=data.values,
                               columns=columns,
                               labels=model.labels_,
                               instance=model,
                               W=None)
        return gdf, results, model_name

    elif pooling == 'unique':
        models = _Map()
        gdf[model_name] = np.nan
        data = data.reset_index()

        for time in times:
            df = data[data[time_var] == time]

            model = specification[method](
                df[columns],
                n_clusters=n_clusters,
                best_model=best_model,
                verbose=verbose,
                **kwargs,
            )

            labels = model.labels_.astype(str)
            clusters = pd.DataFrame({
                model_name: labels,
                time_var: time,
                id_var: df[id_var]
            })
            clusters.set_index([time_var, id_var], inplace=True)
            gdf.update(clusters)
            results = ModelResults(X=df[columns].values,
                                   columns=columns,
                                   labels=model.labels_,
                                   instance=model,
                                   W=None)
            models[time] = results

        gdf = gdf.reset_index()

        return gdf, models, model_name
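
A hedged usage sketch for cluster() on a tiny synthetic long-form frame; the column names and values are made up, and it assumes the function above and its geosnap dependencies are importable.

import geopandas as gpd
import numpy as np
from shapely.geometry import box

rng = np.random.default_rng(0)
tracts = gpd.GeoDataFrame(
    {
        "geoid": list("ABCD") * 2,
        "year": [2000] * 4 + [2010] * 4,
        "median_income": rng.normal(50_000, 10_000, 8),
        "p_poverty": rng.uniform(0, 0.4, 8),
    },
    geometry=[box(i, 0, i + 1, 1) for i in range(4)] * 2,
)

labeled, results, name = cluster(
    tracts,
    method="kmeans",
    columns=["median_income", "p_poverty"],
    n_clusters=2,
)
print(labeled[["geoid", "year", name]])  # cluster labels per tract and year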
Code example #10
def cluster_spatial(
    gdf,
    n_clusters=6,
    spatial_weights="rook",
    method=None,
    columns=None,
    threshold_variable="count",
    threshold=10,
    time_var="year",
    id_var="geoid",
    scaler="std",
    weights_kwargs=None,
    **kwargs,
):
    """Create a *spatial* geodemographic typology.

    Run a cluster analysis on the metro area's neighborhood attributes and include a
    contiguity constraint.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        long-form geodataframe holding neighborhood attribute and geometry data.
    n_clusters : int
        the number of clusters to model (the default is 6).
    spatial_weights : ['queen', 'rook'] or libpysal.weights.W object
        spatial weights matrix specification. By default, geosnap will calculate Rook
        weights, but you can also pass a libpysal.weights.W object for more control
        over the specification.
    method : str in ['ward_spatial', 'spenc', 'skater', 'azp', 'max_p']
        the clustering algorithm used to identify neighborhood types
    columns : array-like
        subset of columns on which to apply the clustering
    threshold_variable : str
        for max-p, which variable should define `p`. The default is "count",
        which will grow regions until the threshold number of polygons have
        been aggregated
    threshold : numeric
        threshold to use for max-p clustering (the default is 10).
    time_var : str
        which column on the dataframe defines time and or sequencing of the
        long-form data. Default is "year"
    id_var : str
        which column on the long-form dataframe identifies the stable units
        over time. In a wide-form dataset, this would be the unique index
    weights_kwargs : dict
        additional keyword arguments that will be passed to the weights
        constructor (from_dataframe) when building weights for each time period
    scaler : None or scaler class from sklearn.preprocessing
        a scikit-learn preprocessing class that will be used to rescale the
        data. Defaults to sklearn.preprocessing.StandardScaler

    Returns
    -------
    gdf : geopandas.GeoDataFrame
        GeoDataFrame with a column of neighborhood cluster labels
        appended as a new column. If cluster method exists as a column on the DataFrame
        then the column will be incremented.

    models : dict of named tuples
        tab-completable dictionary of named tuples keyed on the Community's time variable
        (e.g. year). The tuples store model results and have attributes X, columns, labels,
        instance, W, which store the input matrix, column labels, cluster labels,
        fitted model instance, and spatial weights matrix

    model_name : str
        name of model to be stored in a Community

    """
    specification = {
        "azp": azp,
        "spenc": spenc,
        "ward_spatial": ward_spatial,
        "skater": skater,
        "max_p": max_p,
    }
    if method not in specification.keys():
        raise ValueError(
            "`method` must be one of  ['ward_spatial', 'spenc', 'skater', 'azp', 'max_p']"
        )

    if method in gdf.columns.tolist():
        model_name = method + str(
            len(gdf.columns[gdf.columns.str.startswith(method)]))
    else:
        model_name = method
    if not columns:
        raise ValueError("You must provide a subset of columns as input")
    if not method:
        raise ValueError("You must choose a clustering algorithm to use")
    if scaler == "std":
        scaler = StandardScaler()

    times = gdf[time_var].unique()
    gdf = gdf.set_index([time_var, id_var])

    # this is the dataset we'll operate on
    data = gdf.copy()[columns + ["geometry"]]

    contiguity_weights = {"queen": Queen, "rook": Rook}

    if spatial_weights in contiguity_weights.keys():
        W = contiguity_weights[spatial_weights]
    else:
        W = spatial_weights

    models = _Map()
    ws = {}
    clusters = []
    gdf[model_name] = np.nan

    # loop over each time period, standardize the data and build a weights matrix
    for time in times:
        df = data.loc[time].dropna(how="any", subset=columns).reset_index()
        df[time_var] = time

        if scaler:
            df[columns] = scaler.fit_transform(df[columns].values)

        if weights_kwargs:
            w0 = W.from_dataframe(df, **weights_kwargs)
        else:
            w0 = W.from_dataframe(df)
        w1 = KNN.from_dataframe(df, k=1)
        ws = [w0, w1]

        if threshold_variable and threshold_variable != "count":
            data[threshold_variable] = gdf[threshold_variable]
            threshold_var = data[threshold_variable].values
            ws[0] = attach_islands(ws[0], ws[1])

        elif threshold_variable == "count":
            threshold_var = np.ones(len(data.loc[time]))
            ws[0] = attach_islands(ws[0], ws[1])

        else:
            threshold_var = None

        model = specification[method](
            df[columns],
            w=ws[0],
            n_clusters=n_clusters,
            threshold_variable=threshold_var,
            threshold=threshold,
            **kwargs,
        )

        labels = model.labels_.astype(str)
        clusters = pd.DataFrame({
            model_name: labels,
            time_var: df[time_var],
            id_var: df[id_var]
        })
        clusters = clusters.drop_duplicates(subset=[id_var])
        clusters.set_index([time_var, id_var], inplace=True)
        gdf.update(clusters)
        results = ModelResults(
            X=df[columns].values,
            columns=columns,
            labels=model.labels_,
            instance=model,
            W=ws[0],
        )
        models[time] = results

    gdf = gdf.reset_index()

    return gdf, models, model_name
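
A hedged usage sketch for cluster_spatial() in the same spirit, using contiguous square "tracts" so that Rook contiguity weights are well defined; the data are made up, and it assumes the function above and its geosnap dependencies are importable.

import geopandas as gpd
import numpy as np
from shapely.geometry import box

rng = np.random.default_rng(0)
tracts = gpd.GeoDataFrame(
    {
        "geoid": list("ABCDEF") * 2,
        "year": [2000] * 6 + [2010] * 6,
        "median_income": rng.normal(50_000, 10_000, 12),
        "p_poverty": rng.uniform(0, 0.4, 12),
    },
    geometry=[box(i, 0, i + 1, 1) for i in range(6)] * 2,
)

labeled, models, name = cluster_spatial(
    tracts,
    method="ward_spatial",
    columns=["median_income", "p_poverty"],
    n_clusters=2,
    spatial_weights="rook",
)
print(models[2010].W.neighbors)  # the contiguity weights used for the 2010 model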
Code example #11
File: abstracts.py Project: terratenney/bounder
 def _validate_alist(self):
     np.testing.assert_array_equal(self.w.sparse.toarray(),
                                   W.from_adjlist(self.alist).sparse.toarray())