Code example #1
File: analytics.py  Project: emorrow3/geosnap
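This excerpt relies on imports defined elsewhere in analytics.py; a hedged sketch of the likely external dependencies (azp, spenc, ward_spatial, skater, and max_p are geosnap's own clustering wrappers):
# likely imports for this excerpt; the clustering wrappers are provided elsewhere in geosnap
import copy
import numpy as np
import pandas as pd
from libpysal.weights import Queen, Rook, KNN
from libpysal.weights.util import attach_islands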
def cluster_spatial(dataset,
                    n_clusters=6,
                    weights_type="rook",
                    method=None,
                    best_model=False,
                    columns=None,
                    threshold_variable='count',
                    threshold=10,
                    **kwargs):
    """

    Create a *spatial* geodemographic typology by running a cluster
    analysis on the metro area's neighborhood attributes and including a
    contiguity constraint.

    Parameters
    ----------
    dataset : geosnap dataset
        dataset object whose `census` and `tracts` attributes hold the metro
        area's neighborhood attribute and geometry data
    n_clusters : int
        the number of clusters to derive
    weights_type : str 'queen' or 'rook'
        spatial weights matrix specification
    method : str in ['azp', 'spenc', 'ward_spatial', 'skater', 'max_p']
        the clustering algorithm used to identify neighborhood types
    columns : list-like
        subset of columns on which to apply the clustering

    Returns
    -------
    dataset
        the input dataset with a column of neighborhood cluster labels (named
        after the clustering method) merged onto its `census` table
    """
    assert columns, "You must provide a subset of columns as input"
    assert method, "You must choose a clustering algorithm to use"
    dataset = copy.deepcopy(dataset)

    if threshold_variable == "count":
        allcols = columns + ["year"]
        data = dataset.census[allcols].copy()
        data = data.dropna(how="any")
        data[columns] = data.groupby("year")[columns].apply(
            lambda x: (x - x.mean()) / x.std(ddof=0))

    elif threshold_variable is not None:
        # grab the threshold variable before excluding it from the clustering columns
        # (list.remove returns None, so build the column list with a comprehension)
        threshold_var = dataset.census[threshold_variable]
        allcols = [col for col in columns if col != threshold_variable] + ["year"]
        data = dataset.census[allcols].copy()
        data = data.dropna(how="any")
        data[columns] = data.groupby("year")[columns].apply(
            lambda x: (x - x.mean()) / x.std(ddof=0))

    else:
        allcols = columns + ["year"]
        data = dataset.census[allcols].copy()
        data = data.dropna(how="any")
        data[columns] = data.groupby("year")[columns].apply(
            lambda x: (x - x.mean()) / x.std(ddof=0))

    tracts = dataset.tracts

    def _build_data(data, tracts, year, weights_type):
        df = data.loc[data.year == year].copy().dropna(how="any")
        tracts = tracts.loc[tracts.geoid.isin(df.index)].copy()
        weights = {"queen": Queen, "rook": Rook}
        w = weights[weights_type].from_dataframe(tracts, idVariable="geoid")
        # drop islands from dataset and rebuild weights
        #df.drop(index=w.islands, inplace=True)
        #tracts.drop(index=w.islands, inplace=True)
        #w = weights[weights_type].from_dataframe(tracts, idVariable="geoid")
        knnw = KNN.from_dataframe(tracts, k=1, ids=tracts.geoid.tolist())

        return df, w, knnw

    years = [1980, 1990, 2000, 2010]
    annual = []
    for year in years:
        df, w, knnw = _build_data(data, tracts, year, weights_type)
        annual.append([df, w, knnw])

    datasets = dict(zip(years, annual))

    specification = {
        "azp": azp,
        "spenc": spenc,
        "ward_spatial": ward_spatial,
        "skater": skater,
        "max_p": max_p,
    }

    clusters = []
    for _, val in datasets.items():
        if threshold_variable == "count":
            threshold_var = np.ones(len(val[0]))
            val[1] = attach_islands(val[1], val[2])

        elif threshold_variable is not None:
            # keep only the threshold values for tracts present in this year's data
            threshold_var = threshold_var[threshold_var.index.isin(
                val[0].index)].values
            try:
                val[1] = attach_islands(val[1], val[2])
            except:
                pass
        else:
            threshold_var = None
        model = specification[method](val[0].drop(columns="year"),
                                      w=val[1],
                                      n_clusters=n_clusters,
                                      threshold_variable=threshold_var,
                                      threshold=threshold,
                                      **kwargs)
        labels = model.labels_.astype(str)
        labels = pd.DataFrame({
            method: labels,
            "year": val[0].year,
            "geoid": val[0].index
        })
        clusters.append(labels)

    clusters = pd.concat(clusters)
    clusters["joinkey"] = clusters.geoid + clusters.year.astype(str)
    clusters = clusters.drop(columns="year")
    geoid = dataset.census.index
    dataset.census[
        "joinkey"] = dataset.census.index + dataset.census.year.astype(str)
    if method in dataset.census.columns:
        dataset.census.drop(columns=method, inplace=True)
    dataset.census = dataset.census.merge(clusters, on="joinkey", how="left")
    dataset.census["geoid"] = geoid
    dataset.census.set_index("geoid", inplace=True)

    return dataset
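A hedged usage sketch for this version (the dataset object, its attribute columns, and the variable name dc are illustrative assumptions; the function only requires an object exposing the census and tracts tables used above):
# dc: hypothetical geosnap dataset with .census (indexed by geoid, with a "year" column)
# and .tracts (a GeoDataFrame with a "geoid" column)
dc = cluster_spatial(
    dc,
    n_clusters=6,
    weights_type="queen",
    method="ward_spatial",
    columns=["median_household_income", "p_poverty_rate"],  # assumed attribute columns
)
dc.census["ward_spatial"].value_counts()  # cluster labels merged back onto the census table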
Code example #2
File: analytics.py  Project: peterprescott/geosnap
def cluster_spatial(
    gdf,
    n_clusters=6,
    spatial_weights="rook",
    method=None,
    columns=None,
    threshold_variable="count",
    threshold=10,
    time_var="year",
    id_var="geoid",
    scaler="std",
    weights_kwargs=None,
    **kwargs,
):
    """Create a *spatial* geodemographic typology by running a cluster
    analysis on the metro area's neighborhood attributes and including a
    contiguity constraint.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        long-form geodataframe holding neighborhood attribute and geometry data.
    n_clusters : int
        the number of clusters to model. The default is 6.
    spatial_weights : ['queen', 'rook'] or libpysal.weights.W object
        spatial weights matrix specification. By default, geosnap will calculate Rook
        weights, but you can also pass a libpysal.weights.W object for more control
        over the specification.
    method : str in ['ward_spatial', 'spenc', 'skater', 'azp', 'max_p']
        the clustering algorithm used to identify neighborhood types
    columns : array-like
        subset of columns on which to apply the clustering
    threshold_variable : str
        for max-p, which variable should define `p`. The default is "count",
        which will grow regions until the threshold number of polygons have
        been aggregated
    threshold : numeric
        threshold to use for max-p clustering (the default is 10).
    time_var : str
        which column on the dataframe defines time and/or sequencing of the
        long-form data. Default is "year"
    id_var : str
        which column on the long-form dataframe identifies the stable units
        over time. In a wide-form dataset, this would be the unique index
    weights_kwargs : dict
        additional keyword arguments that will be passed to the spatial weights
        constructor (e.g. when passing a libpysal weights class to spatial_weights)
    scaler : "std", None, or scaler instance from sklearn.preprocessing
        a scikit-learn preprocessing scaler used to rescale the data. The default
        "std" uses sklearn.preprocessing.StandardScaler

    Returns
    -------
    gdf : geopandas.GeoDataFrame
        GeoDataFrame with a column of neighborhood cluster labels
        appended as a new column. If the cluster method already exists as a column
        on the GeoDataFrame, the new column name is suffixed with an incrementing integer.

    models : dict of named tuples
        tab-completable dictionary of named tuples keyed on the Community's time variable
        (e.g. year). The tuples store model results and have attributes X, columns, labels,
        instance, and W, which store the input matrix, column labels, cluster labels,
        fitted model instance, and spatial weights matrix, respectively

    model_name : str
        name of model to be stored in a Community

    """
    specification = {
        "azp": azp,
        "spenc": spenc,
        "ward_spatial": ward_spatial,
        "skater": skater,
        "max_p": max_p,
    }
    if method not in specification.keys():
        raise ValueError(
            "`method` must be one of  ['ward_spatial', 'spenc', 'skater', 'azp', 'max_p']"
        )

    if method in gdf.columns.tolist():
        model_name = method + str(
            len(gdf.columns[gdf.columns.str.startswith(method)]))
    else:
        model_name = method
    if not columns:
        raise ValueError("You must provide a subset of columns as input")
    if not method:
        raise ValueError("You must choose a clustering algorithm to use")
    if scaler == "std":
        scaler = StandardScaler()

    times = gdf[time_var].unique()
    gdf = gdf.set_index([time_var, id_var])

    # this is the dataset we'll operate on
    data = gdf.copy()[columns + ["geometry"]]

    contiguity_weights = {"queen": Queen, "rook": Rook}

    if spatial_weights in contiguity_weights.keys():
        W = contiguity_weights[spatial_weights]
    else:
        W = spatial_weights

    models = _Map()
    ws = {}
    clusters = []
    gdf[model_name] = np.nan

    # loop over each time period, standardize the data and build a weights matrix
    for time in times:
        df = data.loc[time].dropna(how="any", subset=columns).reset_index()
        df[time_var] = time

        if scaler:
            df[columns] = scaler.fit_transform(df[columns].values)

        if weights_kwargs:
            w0 = W.from_dataframe(df, **weights_kwargs)
        else:
            w0 = W.from_dataframe(df)
        w1 = KNN.from_dataframe(df, k=1)
        ws = [w0, w1]

        if threshold_variable and threshold_variable != "count":
            # use the named threshold column, restricted to the current time period
            data[threshold_variable] = gdf[threshold_variable]
            threshold_var = data[threshold_variable].loc[time].values
            ws[0] = attach_islands(ws[0], ws[1])

        elif threshold_variable == "count":
            # grow regions until `threshold` polygons have been aggregated
            threshold_var = np.ones(len(data.loc[time]))
            ws[0] = attach_islands(ws[0], ws[1])

        else:
            threshold_var = None

        model = specification[method](
            df[columns],
            w=ws[0],
            n_clusters=n_clusters,
            threshold_variable=threshold_var,
            threshold=threshold,
            **kwargs,
        )

        labels = model.labels_.astype(str)
        clusters = pd.DataFrame({
            model_name: labels,
            time_var: df[time_var],
            id_var: df[id_var]
        })
        clusters = clusters.drop_duplicates(subset=[id_var])
        clusters.set_index([time_var, id_var], inplace=True)
        gdf.update(clusters)
        results = ModelResults(
            X=df[columns].values,
            columns=columns,
            labels=model.labels_,
            instance=model,
            W=ws[0],
        )
        models[time] = results

    gdf = gdf.reset_index()

    return gdf, models, model_name
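A minimal usage sketch for this signature (the GeoDataFrame tracts_gdf and its attribute columns are assumptions; the three return values follow the docstring above):
# tracts_gdf: hypothetical long-form GeoDataFrame with "geoid", "year", "geometry", and attribute columns
gdf_labeled, models, model_name = cluster_spatial(
    tracts_gdf,
    n_clusters=5,
    spatial_weights="queen",
    method="skater",
    columns=["median_household_income", "p_poverty_rate"],  # assumed columns
)
gdf_labeled[model_name].value_counts()  # labels stored under the (possibly suffixed) method name
models[2010].labels[:10]  # per-period results keyed on the time variable (assuming 2010 is present)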
Code example #3
def cluster_spatial(
    gdf,
    n_clusters=6,
    spatial_weights="rook",
    method=None,
    columns=None,
    threshold_variable="count",
    threshold=10,
    time_var="year",
    id_var="geoid",
    return_model=False,
    scaler=None,
    **kwargs,
):
    """Create a *spatial* geodemographic typology by running a cluster
    analysis on the metro area's neighborhood attributes and including a
    contiguity constraint.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        long-form geodataframe holding neighborhood attribute and geometry data.
    n_clusters : int
        the number of clusters to model. The default is 6.
    spatial_weights : str 'queen' or 'rook'
        spatial weights matrix specification (the default is "rook").
    method : str in ['ward_spatial', 'spenc', 'skater', 'azp', 'max_p']
        the clustering algorithm used to identify neighborhood types
    columns : list-like
        subset of columns on which to apply the clustering
    threshold_variable : str
        for max-p, which variable should define `p`. The default is "count",
        which will grow regions until the threshold number of polygons have
        been aggregated
    threshold : numeric
        threshold to use for max-p clustering (the default is 10).
    time_var : str
        which column on the dataframe defines time and/or sequencing of the
        long-form data. Default is "year"
    id_var : str
        which column on the long-form dataframe identifies the stable units
        over time. In a wide-form dataset, this would be the unique index
    return_model : bool
        whether to also return the fitted model instance for the last time period
        (the default is False)
    scaler : None or scaler instance from sklearn.preprocessing
        a scikit-learn preprocessing scaler used to rescale the data. Defaults to
        StandardScaler

    Returns
    -------
    geopandas.GeoDataFrame with a column of neighborhood cluster labels
    appended as a new column. Will overwrite columns of the same name.

    """
    if not columns:
        raise ValueError("You must provide a subset of columns as input")
    if not method:
        raise ValueError("You must choose a clustering algorithm to use")

    times = gdf[time_var].unique()
    gdf = gdf.set_index([time_var, id_var])

    # this is the dataset we'll operate on
    data = gdf.copy()[columns + ["geometry"]]

    contiguity_weights = {"queen": Queen, "rook": Rook}

    if spatial_weights in contiguity_weights.keys():
        W = contiguity_weights[spatial_weights]
    else:
        W = spatial_weights

    specification = {
        "azp": azp,
        "spenc": spenc,
        "ward_spatial": ward_spatial,
        "skater": skater,
        "max_p": max_p,
    }

    # if the user doesn't specify, use the standard scaler
    if not scaler:
        scaler = StandardScaler()

    ws = {}
    clusters = []
    dfs = []
    # loop over each time period, standardize the data and build a weights matrix
    for time in times:
        df = data.loc[time].dropna(how="any", subset=columns).reset_index()
        df[time_var] = time
        df[columns] = scaler.fit_transform(df[columns].values)
        w0 = W.from_dataframe(df)
        w1 = KNN.from_dataframe(df, k=1)
        ws = [w0, w1]
        # the rescaler can create NaNs if a column has no variance, so fill them with 0
        df = df.fillna(0)

        if threshold_variable and threshold_variable != "count":
            # use the named threshold column, restricted to the current time period
            data[threshold_variable] = gdf[threshold_variable]
            threshold_var = data[threshold_variable].loc[time].values
            ws[0] = attach_islands(ws[0], ws[1])

        elif threshold_variable == "count":
            # grow regions until `threshold` polygons have been aggregated
            threshold_var = np.ones(len(data.loc[time]))
            ws[0] = attach_islands(ws[0], ws[1])

        else:
            threshold_var = None

        model = specification[method](
            df[columns],
            w=ws[0],
            n_clusters=n_clusters,
            threshold_variable=threshold_var,
            threshold=threshold,
            **kwargs,
        )

        labels = model.labels_.astype(str)
        clusters = pd.DataFrame({
            method: labels,
            time_var: df[time_var],
            id_var: df[id_var]
        })
        clusters.set_index([time_var, id_var], inplace=True)
        dfs.append(gdf.loc[time].join(clusters, how="left"))
    gdf = pd.concat(dfs).reset_index()
    if return_model:
        return gdf, model
    return gdf
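A hedged usage sketch for this variant (input names and columns are assumptions); with return_model=True the fitted model for the last time period is returned alongside the labeled GeoDataFrame:
labeled, model = cluster_spatial(
    tracts_gdf,  # hypothetical long-form GeoDataFrame with "geoid" and "year" columns
    method="max_p",
    columns=["p_poverty_rate", "median_home_value"],  # assumed columns
    threshold_variable="count",
    threshold=15,
    return_model=True,
)
labeled[["geoid", "year", "max_p"]].head()  # cluster labels appended under the method name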