def __init__(self, coordinates, **kwargs):
    try:
        from numba import njit
    except ModuleNotFoundError:
        warnings.warn(
            "The numba package is used extensively in this module"
            " to accelerate the computation of graphs. Without numba,"
            " these computations may become unduly slow on large data."
        )
    edges, _ = self._voronoi_edges(coordinates)
    voronoi_neighbors = (
        pandas.DataFrame(edges).groupby(0)[1].apply(list).to_dict()
    )
    W.__init__(self, voronoi_neighbors, **kwargs)
def __init__(self, coordinates, **kwargs):
    try:
        from numba import njit
    except ModuleNotFoundError:
        warnings.warn(
            "The numba package is used extensively in this module"
            " to accelerate the computation of graphs. Without numba,"
            " these computations may become unduly slow on large data."
        )
    # Build the Delaunay edges once, then drop the edges that fail the
    # Gabriel criterion.
    edges, dt = self._voronoi_edges(coordinates)
    droplist = _filter_gabriel(edges, dt.points)
    output = set(map(tuple, edges)).difference(set(droplist))
    gabriel_neighbors = (
        pandas.DataFrame(output).groupby(0)[1].apply(list).to_dict()
    )
    W.__init__(self, gabriel_neighbors, **kwargs)
def __init__(self, coordinates, binary=True, **kwargs):
    try:
        from numba import njit
    except ModuleNotFoundError:
        warnings.warn(
            "The numba package is used extensively in this module"
            " to accelerate the computation of graphs. Without numba,"
            " these computations may become unduly slow on large data."
        )
    # Start from the Delaunay edges and keep only those that satisfy the
    # relative-neighborhood criterion.
    edges, dt = self._voronoi_edges(coordinates)
    output, dkmax = _filter_relativehood(edges, dt.points, return_dkmax=False)
    row, col, data = zip(*output)
    if binary:
        data = numpy.ones_like(col, dtype=float)
    sp = sparse.csc_matrix((data, (row, col)))  # TODO: faster way than this?
    tmp = WSP(sp).to_W()
    W.__init__(self, tmp.neighbors, tmp.weights, **kwargs)
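# Hedged usage sketch for the three constructors above. The class names
# (Voronoi, Gabriel, RelativeNeighborhood) are assumptions -- the snippets
# only show __init__ methods of W subclasses built from a Delaunay
# triangulation of point coordinates.
import numpy

coordinates = numpy.random.random((50, 2))  # 50 points in the unit square

# Each constructor takes an (n, 2) array of coordinates and forwards extra
# keyword arguments to the W base-class constructor.
# voronoi_w = Voronoi(coordinates)                        # Delaunay/Voronoi neighbors
# gabriel_w = Gabriel(coordinates)                        # Gabriel subgraph
# rn_w = RelativeNeighborhood(coordinates, binary=True)   # relative-neighborhood subgraph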
def _geos_sw(features, tolerance=0, silence_warnings=False, resolution=5):
    """
    Generate libpysal spatial weights object based on intersections of features.

    Intersecting features are denoted as neighbours. If tolerance > 0, all
    features within the set tolerance are denoted as neighbours.

    Parameters
    ----------
    features : GeoDataFrame
        GeoDataFrame
    tolerance : float (default 0)
        minimal distance between colors
    silence_warnings : bool (default False)
        silence libpysal warnings (if tolerance is set)
    resolution : int (default 5)
        resolution of buffer if tolerance > 0

    Returns
    -------
    W : libpysal.weights.W
        spatial weights object
    """
    try:
        from libpysal.weights import W
    except ImportError:
        raise ImportError("The 'libpysal' package is required.")

    neighbors = {}

    if tolerance > 0:
        features = features.copy()
        # Buffer by half the tolerance so two features within `tolerance`
        # of each other end up intersecting.
        features["geometry"] = features.geometry.buffer(tolerance / 2, resolution)

    sindex = features.sindex
    for i, (ix, g) in enumerate(features.geometry.items()):
        possible_matches_index = list(sindex.intersection(g.bounds))
        possible_matches_index.remove(i)
        possible_matches = features.iloc[possible_matches_index]
        precise_matches = possible_matches.loc[possible_matches.intersects(g)]
        neighbors[ix] = list(precise_matches.index)

    return W(neighbors, silence_warnings=silence_warnings)
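# Hedged usage sketch for the _geos_sw helper above, run on a small synthetic
# polygon layer; any polygon GeoDataFrame would work the same way.
import geopandas as gpd
from shapely.geometry import box

features = gpd.GeoDataFrame(
    geometry=[box(0, 0, 1, 1), box(1, 0, 2, 1), box(5, 5, 6, 6)]
)
# Touching squares become neighbours; the distant square stays an island.
w_contig = _geos_sw(features, silence_warnings=True)
# With a tolerance, features within that distance are linked as well.
w_fuzzy = _geos_sw(features, tolerance=8)
print(w_contig.neighbors, w_fuzzy.neighbors)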
def add_block_gini(bldg_pop: gpd.GeoDataFrame,
                   values_col: str,
                   weights: Optional[libpysal.weights.weights.W] = None
                   ) -> gpd.GeoDataFrame:
    """
    Calculates the Gini coefficient for the passed-in values_col based on the
    passed-in gpd.GeoDataFrame.
    """
    if values_col not in bldg_pop.columns:
        raise IndexError(f'{values_col} was not found in bldg_pop')

    if weights is None:
        # Default weights: every building is a neighbor of every other
        # building in the same block.
        weights_dict = {
            bldg_pop.loc[index_val]['bldg_id']:
                bldg_pop[bldg_pop['block_id'] == bldg_pop.loc[index_val]['block_id']]['bldg_id'].tolist()
            for index_val in bldg_pop.index.tolist()
        }
        weights = W(weights_dict, ids=bldg_pop['bldg_id'].tolist())

    bldg_pop['gini_' + values_col] = Gini(bldg_pop,
                                          values=values_col,
                                          spatial_weights=weights,
                                          unique_id='bldg_id').series
    return bldg_pop
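# Hedged usage sketch for add_block_gini. Assumes bldg_pop carries 'bldg_id',
# 'block_id', a geometry column, and a population column named 'pop' (that
# column name is an assumption).
# bldg_pop = add_block_gini(bldg_pop, values_col='pop')
# bldg_pop[['bldg_id', 'gini_pop']].head()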
def set_connections(self, alist=None, w=None, nx=None, verify=True):
    """
    Set up the graph structure. It will be kept under the hood as an adjacency
    list and a PySAL W object, so if a networkx object is supplied, conversion
    is necessary. Only one argument is required to populate the connections.
    """
    if all([nx is None, alist is None, w is None]):
        raise ValueError("At least one adjacency list, W, or networkx graph"
                         " must be provided to conduct clustering.")
    if nx is not None and all([alist is None, w is None]):
        w = W.from_networkx(nx)
    w, alist = utils.get_w_and_alist(alist=alist, W=w, skip_verify=not verify)
    self.nx = nx if nx is not None else w.to_networkx()
    self.w = w
    self.alist = alist
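# Hedged usage sketch, assuming this method lives on a clustering class
# (called SpatialClusterer here purely for illustration) and that networkx is
# installed.
# import networkx as nx_lib
#
# g = nx_lib.random_geometric_graph(20, 0.3)
# model = SpatialClusterer()     # hypothetical owning class
# model.set_connections(nx=g)    # stores w, alist, and nx views of the same graph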
def add_building_adjacency(bldg_pop: gpd.GeoDataFrame,
                           weights: Optional[libpysal.weights.weights.W] = None
                           ) -> gpd.GeoDataFrame:
    """
    Calculates the building adjacency, roughly a ratio of the number of
    buildings to the number of "built-up patches". The "spatial_weights_higher"
    is created by defining every building in the same block to be connected to
    one another. The intention is to figure out whether the block is composed
    of adjoining buildings or whether it's primarily freestanding buildings.

    Defined here:
    https://www-sciencedirect-com.proxy.uchicago.edu/science/article/pii/S0169204617301275?via%3Dihub
    Documentation here:
    http://docs.momepy.org/en/stable/generated/momepy.BuildingAdjacency.html#momepy.BuildingAdjacency
    """
    if weights is None:
        # Default weights: every building is a neighbor of every other
        # building in the same block.
        weights_dict = {
            bldg_pop.loc[index_val]['bldg_id']:
                bldg_pop[bldg_pop['block_id'] == bldg_pop.loc[index_val]['block_id']]['bldg_id'].tolist()
            for index_val in bldg_pop.index.tolist()
        }
        weights = W(weights_dict, ids=bldg_pop['bldg_id'].tolist())

    bldg_pop['building_adjacency'] = BuildingAdjacency(bldg_pop,
                                                       spatial_weights_higher=weights,
                                                       unique_id='bldg_id').series
    return bldg_pop
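# Hedged usage sketch: add_block_gini and add_building_adjacency build the
# same block-level default weights, so they can be chained on one
# GeoDataFrame. Column names other than 'bldg_id' / 'block_id' are assumptions.
# bldg_pop = add_building_adjacency(bldg_pop)
# bldg_pop[['bldg_id', 'building_adjacency']].describe()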
def _geos_sw(features, tolerance=0, silence_warnings=False):
    """
    Generate libpysal spatial weights object based on intersections of features.

    Intersecting features are denoted as neighbours. If tolerance > 0, all
    features within the set tolerance are denoted as neighbours.

    Parameters
    ----------
    features : GeoDataFrame
        GeoDataFrame
    tolerance : float (default 0)
        minimal distance between colors
    silence_warnings : bool (default False)
        silence libpysal warnings (if tolerance is set)

    Returns
    -------
    W : libpysal.weights.W
        spatial weights object
    """
    neighbors = {}

    if tolerance > 0:
        features = features.copy()
        features["geometry"] = features.geometry.buffer(tolerance / 2, 5)

    sindex = features.sindex
    for i, (ix, f) in enumerate(features.iterrows()):
        g = f.geometry
        possible_matches_index = list(sindex.intersection(g.bounds))
        possible_matches_index.remove(i)
        possible_matches = features.iloc[possible_matches_index]
        precise_matches = possible_matches.loc[possible_matches.intersects(g)]
        neighbors[ix] = list(precise_matches.index)

    return W(neighbors, silence_warnings=silence_warnings)
def cluster(
    gdf,
    n_clusters=6,
    method=None,
    best_model=False,
    columns=None,
    verbose=False,
    time_var="year",
    id_var="geoid",
    scaler='std',
    pooling="fixed",
    w=False,
    **kwargs,
):
    """Create a geodemographic typology by running a cluster analysis on the
    study area's neighborhood attributes.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame, required
        long-form GeoDataFrame containing neighborhood attributes
    n_clusters : int, required
        the number of clusters to model (the default is 6).
    method : str in ['kmeans', 'ward', 'affinity_propagation', 'spectral', 'gaussian_mixture', 'hdbscan'], required
        the clustering algorithm used to identify neighborhood types
    best_model : bool, optional
        if using a gaussian mixture model, use BIC to choose the best
        n_clusters (the default is False).
    columns : list-like, required
        subset of columns on which to apply the clustering
    verbose : bool, optional
        whether to print warning messages (the default is False).
    time_var : str, optional
        which column on the dataframe defines time and/or sequencing of the
        long-form data. Default is "year"
    id_var : str, optional
        which column on the long-form dataframe identifies the stable units
        over time. In a wide-form dataset, this would be the unique index
    scaler : None or scaler from sklearn.preprocessing, optional
        a scikit-learn preprocessing class that will be used to rescale the
        data. Defaults to sklearn.preprocessing.StandardScaler
    pooling : ["fixed", "pooled", "unique"], optional (default='fixed')
        How to treat temporal data when applying scaling. Options include:

        * fixed : scaling is fixed to each time period
        * pooled : data are pooled across all time periods
        * unique : if scaling, apply the scaler to each time period, then
          generate clusters unique to each time period.

    Returns
    -------
    gdf : geopandas.GeoDataFrame
        GeoDataFrame with a column of neighborhood cluster labels appended as
        a new column. If the cluster method already exists as a column on the
        DataFrame then the column will be incremented.
    model : named tuple
        A tuple with attributes X, columns, labels, instance, W, which store
        the input matrix, column labels, cluster labels, fitted model
        instance, and spatial weights matrix
    model_name : str
        name of model to be stored in a Community
    """
    specification = {
        "ward": ward,
        "kmeans": kmeans,
        "affinity_propagation": affinity_propagation,
        "gaussian_mixture": gaussian_mixture,
        "spectral": spectral,
        "hdbscan": hdbscan,
        "spenc": spenc,
        "ward_spatial": ward_spatial,
        "skater": skater,
        "max_p": max_p,
    }

    if scaler == "std":
        scaler = StandardScaler()

    if method not in specification.keys():
        raise ValueError(f"`method` must be one of {specification.keys()}")

    # if we already have a column named after the clustering method, then increment it.
    if method in gdf.columns.tolist():
        model_name = method + str(
            len(gdf.columns[gdf.columns.str.startswith(method)]))
    else:
        model_name = method
    if columns is None or len(columns) < 1:
        raise ValueError("You must provide a subset of columns as input")

    times = gdf[time_var].unique()
    gdf = gdf.set_index([time_var, id_var])

    # this is the dataset we'll operate on
    data = gdf.copy()[columns]
    data = data.dropna(how="any", subset=columns)

    if scaler:
        if pooling in ["fixed", "unique"]:
            # if fixed (or unique), scale within each time period
            for time in times:
                data.loc[time] = scaler.fit_transform(data.loc[time].values)
        elif pooling == "pooled":
            # if pooled, scale the whole series at once
            data.loc[:, columns] = scaler.fit_transform(data.values)

    # rescaling can create NaNs if a column has no variance, so fill with 0
    data = data.fillna(0)

    if pooling != "unique":
        if w:
            d = gdf.copy()
            d[columns] = data[columns]
            d = d.reset_index()
            dfs = [d[d[time_var] == time] for time in times]
            data, w = pool_dfs(dfs)
            data = data.set_index([time_var, id_var])[columns]
            w = W(w)

        # run the cluster model then join the labels back to the original data
        model = specification[method](
            data,
            n_clusters=n_clusters,
            best_model=best_model,
            verbose=verbose,
            w=w,
            **kwargs,
        )
        labels = model.labels_.astype(str)
        data = data.reset_index()
        clusters = pd.DataFrame({
            model_name: labels,
            time_var: data[time_var],
            id_var: data[id_var],
        })
        clusters.set_index([time_var, id_var], inplace=True)
        clusters = clusters[~clusters.index.duplicated(keep='first')]
        gdf = gdf.join(clusters, how="left")
        gdf = gdf.reset_index()
        results = ModelResults(X=data.values,
                               columns=columns,
                               labels=model.labels_,
                               instance=model,
                               W=None)
        return gdf, results, model_name

    elif pooling == 'unique':
        models = _Map()
        gdf[model_name] = np.nan
        data = data.reset_index()
        for time in times:
            df = data[data[time_var] == time]
            model = specification[method](
                df[columns],
                n_clusters=n_clusters,
                best_model=best_model,
                verbose=verbose,
                **kwargs,
            )
            labels = model.labels_.astype(str)
            clusters = pd.DataFrame({
                model_name: labels,
                time_var: time,
                id_var: df[id_var],
            })
            clusters.set_index([time_var, id_var], inplace=True)
            gdf.update(clusters)
            results = ModelResults(X=df[columns].values,
                                   columns=columns,
                                   labels=model.labels_,
                                   instance=model,
                                   W=None)
            models[time] = results
        gdf = gdf.reset_index()
        return gdf, models, model_name
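# Hedged usage sketch for cluster(), patterned on geosnap-style long-form
# data. The attribute column names ('median_income', 'p_poverty') are
# illustrative assumptions.
# gdf_labeled, results, name = cluster(
#     gdf,
#     method="kmeans",
#     n_clusters=5,
#     columns=["median_income", "p_poverty"],
#     time_var="year",
#     id_var="geoid",
# )
# gdf_labeled[name].value_counts()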
def cluster_spatial( gdf, n_clusters=6, spatial_weights="rook", method=None, columns=None, threshold_variable="count", threshold=10, time_var="year", id_var="geoid", scaler="std", weights_kwargs=None, **kwargs, ): """Create a *spatial* geodemographic typology. Run a cluster analysis on the metro area's neighborhood attributes and include a contiguity constraint. Parameters ---------- gdf : geopandas.GeoDataFrame long-form geodataframe holding neighborhood attribute and geometry data. n_clusters : int the number of clusters to model. The default is 6). spatial_weights : ['queen', 'rook'] or libpysal.weights.W object spatial weights matrix specification`. By default, geosnap will calculate Rook weights, but you can also pass a libpysal.weights.W object for more control over the specification. method : str in ['ward_spatial', 'spenc', 'skater', 'azp', 'max_p'] the clustering algorithm used to identify neighborhood types columns : array-like subset of columns on which to apply the clustering threshold_variable : str for max-p, which variable should define `p`. The default is "count", which will grow regions until the threshold number of polygons have been aggregated threshold : numeric threshold to use for max-p clustering (the default is 10). time_var : str which column on the dataframe defines time and or sequencing of the long-form data. Default is "year" id_var : str which column on the long-form dataframe identifies the stable units over time. In a wide-form dataset, this would be the unique index weights_kwargs : dict If passing a libpysal.weights.W instance to spatial_weights, these additional keyword arguments that will be passed to the weights constructor scaler : None or scaler class from sklearn.preprocessing a scikit-learn preprocessing class that will be used to rescale the data. Defaults to sklearn.preprocessing.StandardScaler Returns ------- gdf : geopandas.GeoDataFrame GeoDataFrame with a column of neighborhood cluster labels appended as a new column. If cluster method exists as a column on the DataFrame then the column will be incremented. models : dict of named tuples tab-completable dictionary of named tuples keyed on the Community's time variable (e.g. year). 
        The tuples store model results and have attributes X, columns, labels,
        instance, W, which store the input matrix, column labels, cluster
        labels, fitted model instance, and spatial weights matrix
    model_name : str
        name of model to be stored in a Community
    """
    specification = {
        "azp": azp,
        "spenc": spenc,
        "ward_spatial": ward_spatial,
        "skater": skater,
        "max_p": max_p,
    }

    if method not in specification.keys():
        raise ValueError(
            "`method` must be one of ['ward_spatial', 'spenc', 'skater', 'azp', 'max_p']"
        )
    if method in gdf.columns.tolist():
        model_name = method + str(
            len(gdf.columns[gdf.columns.str.startswith(method)]))
    else:
        model_name = method
    if not columns:
        raise ValueError("You must provide a subset of columns as input")
    if not method:
        raise ValueError("You must choose a clustering algorithm to use")
    if scaler == "std":
        scaler = StandardScaler()

    times = gdf[time_var].unique()
    gdf = gdf.set_index([time_var, id_var])

    # this is the dataset we'll operate on
    data = gdf.copy()[columns + ["geometry"]]

    contiguity_weights = {"queen": Queen, "rook": Rook}

    if spatial_weights in contiguity_weights.keys():
        W = contiguity_weights[spatial_weights]
    else:
        W = spatial_weights

    models = _Map()
    ws = {}
    clusters = []
    gdf[model_name] = np.nan

    # loop over each time period, standardize the data and build a weights matrix
    for time in times:
        df = data.loc[time].dropna(how="any", subset=columns).reset_index()
        df[time_var] = time

        if scaler:
            df[columns] = scaler.fit_transform(df[columns].values)

        if weights_kwargs:
            w0 = W.from_dataframe(df, **weights_kwargs)
        else:
            w0 = W.from_dataframe(df)
        w1 = KNN.from_dataframe(df, k=1)
        ws = [w0, w1]

        if threshold_variable and threshold_variable != "count":
            data[threshold_variable] = gdf[threshold_variable]
            threshold_var = data[threshold_variable].values
            ws[0] = attach_islands(ws[0], ws[1])
        elif threshold_variable == "count":
            threshold_var = np.ones(len(data.loc[time]))
            ws[0] = attach_islands(ws[0], ws[1])
        else:
            threshold_var = None

        model = specification[method](
            df[columns],
            w=ws[0],
            n_clusters=n_clusters,
            threshold_variable=threshold_var,
            threshold=threshold,
            **kwargs,
        )
        labels = model.labels_.astype(str)
        clusters = pd.DataFrame({
            model_name: labels,
            time_var: df[time_var],
            id_var: df[id_var],
        })
        clusters = clusters.drop_duplicates(subset=[id_var])
        clusters.set_index([time_var, id_var], inplace=True)
        gdf.update(clusters)
        results = ModelResults(
            X=df[columns].values,
            columns=columns,
            labels=model.labels_,
            instance=model,
            W=ws[0],
        )
        models[time] = results

    gdf = gdf.reset_index()

    return gdf, models, model_name
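# Hedged usage sketch for cluster_spatial(); the attribute column names and
# the 2010 time key are illustrative assumptions.
# gdf_labeled, models, name = cluster_spatial(
#     gdf,
#     method="ward_spatial",
#     spatial_weights="rook",
#     n_clusters=5,
#     columns=["median_income", "p_poverty"],
# )
# models[2010].W.n   # contiguity weights used for the 2010 cross-section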
def _validate_alist(self):
    np.testing.assert_array_equal(
        self.w.sparse.toarray(),
        W.from_adjlist(self.alist).sparse.toarray())