import copy

import numpy as np
import pandas as pd
from libpysal.weights import KNN, Queen, Rook
from libpysal.weights.util import attach_islands
from sklearn.preprocessing import StandardScaler

# The spatially constrained clustering wrappers (azp, max_p, skater, spenc,
# ward_spatial) and the ModelResults/_Map helpers used below are assumed to be
# provided by this package's own clustering utilities (e.g. a relative
# ``from .cluster import ...``); the exact import path is not shown here.


def cluster_spatial(dataset,
                    n_clusters=6,
                    weights_type="rook",
                    method=None,
                    best_model=False,
                    columns=None,
                    threshold_variable="count",
                    threshold=10,
                    **kwargs):
    """Create a *spatial* geodemographic typology by running a cluster
    analysis on the metro area's neighborhood attributes and including a
    contiguity constraint.

    Parameters
    ----------
    n_clusters : int
        the number of clusters to derive
    weights_type : str
        'queen' or 'rook' spatial weights matrix specification
    method : str
        the clustering algorithm used to identify neighborhood types
    columns : list-like
        subset of columns on which to apply the clustering

    Returns
    -------
    dataset
        a copy of the input dataset with a column of cluster labels merged
        onto its ``census`` attribute
    """
    assert columns, "You must provide a subset of columns as input"
    assert method, "You must choose a clustering algorithm to use"
    dataset = copy.deepcopy(dataset)

    if threshold_variable == "count":
        allcols = columns + ["year"]
        data = dataset.census[allcols].copy()
        data = data.dropna(how="any")
        # standardize each attribute within its own year
        data[columns] = data.groupby("year")[columns].apply(
            lambda x: (x - x.mean()) / x.std(ddof=0))
    elif threshold_variable is not None:
        threshold_var = dataset.census[threshold_variable]
        allcols = [col for col in columns if col != threshold_variable] + ["year"]
        data = dataset.census[allcols].copy()
        data = data.dropna(how="any")
        data[columns] = data.groupby("year")[columns].apply(
            lambda x: (x - x.mean()) / x.std(ddof=0))
    else:
        allcols = columns + ["year"]
        data = dataset.census[allcols].copy()
        data = data.dropna(how="any")
        data[columns] = data.groupby("year")[columns].apply(
            lambda x: (x - x.mean()) / x.std(ddof=0))

    tracts = dataset.tracts

    def _build_data(data, tracts, year, weights_type):
        df = data.loc[data.year == year].copy().dropna(how="any")
        tracts = tracts.loc[tracts.geoid.isin(df.index)].copy()
        weights = {"queen": Queen, "rook": Rook}
        w = weights[weights_type].from_dataframe(tracts, idVariable="geoid")
        # drop islands from dataset and rebuild weights
        # df.drop(index=w.islands, inplace=True)
        # tracts.drop(index=w.islands, inplace=True)
        # w = weights[weights_type].from_dataframe(tracts, idVariable="geoid")
        knnw = KNN.from_dataframe(tracts, k=1, ids=tracts.geoid.tolist())
        return df, w, knnw

    years = [1980, 1990, 2000, 2010]
    annual = []
    for year in years:
        df, w, knnw = _build_data(data, tracts, year, weights_type)
        annual.append([df, w, knnw])
    datasets = dict(zip(years, annual))

    specification = {
        "azp": azp,
        "spenc": spenc,
        "ward_spatial": ward_spatial,
        "skater": skater,
        "max_p": max_p,
    }

    clusters = []
    for _, val in datasets.items():
        if threshold_variable == "count":
            threshold_var = np.ones(len(val[0]))
            val[1] = attach_islands(val[1], val[2])
        elif threshold_variable is not None:
            threshold_var = threshold_var[threshold_var.index.isin(
                val[0].index)].values
            try:
                val[1] = attach_islands(val[1], val[2])
            except Exception:
                pass
        else:
            threshold_var = None

        model = specification[method](val[0].drop(columns="year"),
                                      w=val[1],
                                      n_clusters=n_clusters,
                                      threshold_variable=threshold_var,
                                      threshold=threshold,
                                      **kwargs)
        labels = model.labels_.astype(str)
        labels = pd.DataFrame({
            method: labels,
            "year": val[0].year,
            "geoid": val[0].index
        })
        clusters.append(labels)

    clusters = pd.concat(clusters)
    clusters["joinkey"] = clusters.geoid + clusters.year.astype(str)
    clusters = clusters.drop(columns="year")

    geoid = dataset.census.index
    dataset.census[
        "joinkey"] = dataset.census.index + dataset.census.year.astype(str)
    if method in dataset.census.columns:
        dataset.census.drop(columns=method, inplace=True)
    dataset.census = dataset.census.merge(clusters, on="joinkey", how="left")
    dataset.census["geoid"] = geoid
    dataset.census.set_index("geoid", inplace=True)
    return dataset
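
# A minimal sketch of the per-year standardization used above: the
# groupby/apply step z-scores each attribute against the distribution of its
# own time period rather than the pooled data. The helper name and toy values
# are hypothetical, shown only to make the transform concrete; ``transform``
# is used here because the example standardizes a single column.
def _demo_yearly_standardization():
    toy = pd.DataFrame({
        "year": [1990, 1990, 1990, 2000, 2000, 2000],
        "median_income": [30_000, 45_000, 60_000, 35_000, 50_000, 65_000],
    })
    toy["median_income_z"] = toy.groupby("year")["median_income"].transform(
        lambda x: (x - x.mean()) / x.std(ddof=0))
    return toy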
def cluster_spatial(
    gdf,
    n_clusters=6,
    spatial_weights="rook",
    method=None,
    columns=None,
    threshold_variable="count",
    threshold=10,
    time_var="year",
    id_var="geoid",
    scaler="std",
    weights_kwargs=None,
    **kwargs,
):
    """Create a *spatial* geodemographic typology by running a cluster
    analysis on the metro area's neighborhood attributes and including a
    contiguity constraint.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        long-form geodataframe holding neighborhood attribute and geometry data.
    n_clusters : int
        the number of clusters to model. The default is 6.
    spatial_weights : ['queen', 'rook'] or libpysal.weights.W object
        spatial weights matrix specification. By default, geosnap will
        calculate Rook weights, but you can also pass a libpysal.weights.W
        object for more control over the specification.
    method : str in ['ward_spatial', 'spenc', 'skater', 'azp', 'max_p']
        the clustering algorithm used to identify neighborhood types
    columns : array-like
        subset of columns on which to apply the clustering
    threshold_variable : str
        for max-p, which variable should define `p`. The default is "count",
        which will grow regions until the threshold number of polygons have
        been aggregated
    threshold : numeric
        threshold to use for max-p clustering (the default is 10).
    time_var : str
        which column on the dataframe defines time and/or sequencing of the
        long-form data. Default is "year"
    id_var : str
        which column on the long-form dataframe identifies the stable units
        over time. In a wide-form dataset, this would be the unique index
    weights_kwargs : dict
        additional keyword arguments that will be passed to the weights
        constructor (useful when passing a libpysal.weights.W class to
        spatial_weights)
    scaler : None or scaler class from sklearn.preprocessing
        a scikit-learn preprocessing class that will be used to rescale the
        data. Defaults to sklearn.preprocessing.StandardScaler

    Returns
    -------
    gdf : geopandas.GeoDataFrame
        GeoDataFrame with a column of neighborhood cluster labels appended as
        a new column. If the cluster method already exists as a column on the
        DataFrame, the new column name will be incremented.
    models : dict of named tuples
        tab-completable dictionary of named tuples keyed on the Community's
        time variable (e.g. year). The tuples store model results and have
        attributes X, columns, labels, instance, and W, which store the input
        matrix, column labels, cluster labels, fitted model instance, and
        spatial weights matrix.
    model_name : str
        name of model to be stored in a Community
    """
    if not columns:
        raise ValueError("You must provide a subset of columns as input")
    if not method:
        raise ValueError("You must choose a clustering algorithm to use")

    specification = {
        "azp": azp,
        "spenc": spenc,
        "ward_spatial": ward_spatial,
        "skater": skater,
        "max_p": max_p,
    }
    if method not in specification.keys():
        raise ValueError(
            "`method` must be one of ['ward_spatial', 'spenc', 'skater', 'azp', 'max_p']"
        )

    # increment the column name if the method has already been run on this gdf
    if method in gdf.columns.tolist():
        model_name = method + str(
            len(gdf.columns[gdf.columns.str.startswith(method)]))
    else:
        model_name = method

    if scaler == "std":
        scaler = StandardScaler()

    times = gdf[time_var].unique()
    gdf = gdf.set_index([time_var, id_var])

    # this is the dataset we'll operate on
    data = gdf.copy()[columns + ["geometry"]]

    contiguity_weights = {"queen": Queen, "rook": Rook}
    if spatial_weights in contiguity_weights.keys():
        W = contiguity_weights[spatial_weights]
    else:
        W = spatial_weights

    models = _Map()
    ws = {}
    clusters = []
    gdf[model_name] = np.nan

    # loop over each time period, standardize the data, and build a weights matrix
    for time in times:
        df = data.loc[time].dropna(how="any", subset=columns).reset_index()
        df[time_var] = time
        if scaler:
            df[columns] = scaler.fit_transform(df[columns].values)

        if weights_kwargs:
            w0 = W.from_dataframe(df, **weights_kwargs)
        else:
            w0 = W.from_dataframe(df)
        w1 = KNN.from_dataframe(df, k=1)
        ws = [w0, w1]

        if threshold_variable and threshold_variable != "count":
            data[threshold_variable] = gdf[threshold_variable]
            threshold_var = data[threshold_variable].values
            ws[0] = attach_islands(ws[0], ws[1])
        elif threshold_variable == "count":
            threshold_var = np.ones(len(data.loc[time]))
            ws[0] = attach_islands(ws[0], ws[1])
        else:
            threshold_var = None

        model = specification[method](
            df[columns],
            w=ws[0],
            n_clusters=n_clusters,
            threshold_variable=threshold_var,
            threshold=threshold,
            **kwargs,
        )
        labels = model.labels_.astype(str)
        clusters = pd.DataFrame({
            model_name: labels,
            time_var: df[time_var],
            id_var: df[id_var]
        })
        clusters = clusters.drop_duplicates(subset=[id_var])
        clusters.set_index([time_var, id_var], inplace=True)
        gdf.update(clusters)

        results = ModelResults(
            X=df[columns].values,
            columns=columns,
            labels=model.labels_,
            instance=model,
            W=ws[0],
        )
        models[time] = results

    gdf = gdf.reset_index()
    return gdf, models, model_name
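
# A minimal sketch of the column-naming rule used above: when the chosen
# method already exists as a column, the new label column gets the count of
# columns sharing that prefix appended, so repeated runs produce
# "ward_spatial1", "ward_spatial2", ... instead of overwriting earlier labels.
# The helper name and the toy frame are hypothetical.
def _demo_model_name_increment():
    toy = pd.DataFrame(columns=["geoid", "year", "ward_spatial", "ward_spatial1"])
    method = "ward_spatial"
    if method in toy.columns.tolist():
        model_name = method + str(
            len(toy.columns[toy.columns.str.startswith(method)]))
    else:
        model_name = method
    return model_name  # -> "ward_spatial2" for this toy frame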
def cluster_spatial(
    gdf,
    n_clusters=6,
    spatial_weights="rook",
    method=None,
    columns=None,
    threshold_variable="count",
    threshold=10,
    time_var="year",
    id_var="geoid",
    return_model=False,
    scaler=None,
    **kwargs,
):
    """Create a *spatial* geodemographic typology by running a cluster
    analysis on the metro area's neighborhood attributes and including a
    contiguity constraint.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        long-form geodataframe holding neighborhood attribute and geometry data.
    n_clusters : int
        the number of clusters to model. The default is 6.
    spatial_weights : str
        'queen' or 'rook' spatial weights matrix specification
        (the default is "rook").
    method : str
        the clustering algorithm used to identify neighborhood types
    columns : list-like
        subset of columns on which to apply the clustering
    threshold_variable : str
        for max-p, which variable should define `p`. The default is "count",
        which will grow regions until the threshold number of polygons have
        been aggregated
    threshold : numeric
        threshold to use for max-p clustering (the default is 10).
    time_var : str
        which column on the dataframe defines time and/or sequencing of the
        long-form data. Default is "year"
    id_var : str
        which column on the long-form dataframe identifies the stable units
        over time. In a wide-form dataset, this would be the unique index
    return_model : bool
        whether to also return the fitted model instance from the final time
        period (the default is False).
    scaler : None or scaler class from sklearn.preprocessing
        a scikit-learn preprocessing class that will be used to rescale the
        data. Defaults to sklearn.preprocessing.StandardScaler

    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame with a column of neighborhood cluster labels appended as
        a new column. Will overwrite columns of the same name.
    """
    if not columns:
        raise ValueError("You must provide a subset of columns as input")
    if not method:
        raise ValueError("You must choose a clustering algorithm to use")

    times = gdf[time_var].unique()
    gdf = gdf.set_index([time_var, id_var])

    # this is the dataset we'll operate on
    data = gdf.copy()[columns + ["geometry"]]

    contiguity_weights = {"queen": Queen, "rook": Rook}
    if spatial_weights in contiguity_weights.keys():
        W = contiguity_weights[spatial_weights]
    else:
        W = spatial_weights

    specification = {
        "azp": azp,
        "spenc": spenc,
        "ward_spatial": ward_spatial,
        "skater": skater,
        "max_p": max_p,
    }

    # if the user doesn't specify, use the standard scaler
    if not scaler:
        scaler = StandardScaler()

    ws = {}
    clusters = []
    dfs = []

    # loop over each time period, standardize the data, and build a weights matrix
    for time in times:
        df = data.loc[time].dropna(how="any", subset=columns).reset_index()
        df[time_var] = time
        df[columns] = scaler.fit_transform(df[columns].values)

        w0 = W.from_dataframe(df)
        w1 = KNN.from_dataframe(df, k=1)
        ws = [w0, w1]

        # the rescaler can create nans if a column has no variance, so fill with 0
        df = df.fillna(0)

        if threshold_variable and threshold_variable != "count":
            data[threshold_variable] = gdf[threshold_variable]
            threshold_var = data[threshold_variable].values
            ws[0] = attach_islands(ws[0], ws[1])
        elif threshold_variable == "count":
            threshold_var = np.ones(len(data.loc[time]))
            ws[0] = attach_islands(ws[0], ws[1])
        else:
            threshold_var = None

        model = specification[method](
            df[columns],
            w=ws[0],
            n_clusters=n_clusters,
            threshold_variable=threshold_var,
            threshold=threshold,
            **kwargs,
        )
        labels = model.labels_.astype(str)
        clusters = pd.DataFrame({
            method: labels,
            time_var: df[time_var],
            id_var: df[id_var]
        })
        clusters.set_index([time_var, id_var], inplace=True)
        dfs.append(gdf.loc[time].join(clusters, how="left"))

    gdf = pd.concat(dfs).reset_index()
    if return_model:
        return gdf, model
    return gdf
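
# A minimal sketch of the island-repair pattern used in the versions above:
# a contiguity matrix is paired with a k=1 nearest-neighbor matrix, and
# libpysal's attach_islands links any polygon with no contiguous neighbor to
# its nearest observation so constrained clustering can still run. The helper
# name and toy geometries are hypothetical; geopandas and shapely are assumed
# to be available.
def _demo_island_repair():
    import geopandas as gpd
    from shapely.geometry import box

    # three touching unit squares plus one detached square (the island)
    toy = gpd.GeoDataFrame(
        {"geoid": ["a", "b", "c", "d"]},
        geometry=[box(0, 0, 1, 1), box(1, 0, 2, 1), box(2, 0, 3, 1),
                  box(10, 10, 11, 11)],
    )
    w_rook = Rook.from_dataframe(toy)      # the detached square has no neighbors
    w_knn1 = KNN.from_dataframe(toy, k=1)  # nearest-neighbor graph always connects it
    w_fixed = attach_islands(w_rook, w_knn1)
    return w_rook.islands, w_fixed.islands  # e.g. [3] before, [] after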