def _area_interpolate( source_df, target_df, extensive_variables=None, intensive_variables=None, tables=None, allocate_total=True, ): """ Area interpolation for extensive and intensive variables. Parameters ---------- source_df : geopandas.GeoDataFrame (required) geodataframe with polygon geometries target_df : geopandas.GeoDataFrame (required) geodataframe with polygon geometries extensive_variables : list, (optional) columns in dataframes for extensive variables intensive_variables : list, (optional) columns in dataframes for intensive variables tables : tuple (optional) two 2-D numpy arrays SU: area of intersection of source geometry i with union geometry j UT: binary mapping of union geometry j to target geometry t allocate_total : boolean True if total value of source area should be allocated. False if denominator is area of i. Note that the two cases would be identical when the area of the source polygon is exhausted by intersections. See Notes for more details. Returns ------- estimates : geopandas.GeoDataFrame new geodaraframe with interpolated variables as columns and target_df geometry as output geometry Notes ----- The assumption is both dataframes have the same coordinate reference system. For an extensive variable, the estimate at target polygon j (default case) is: v_j = \sum_i v_i w_{i,j} w_{i,j} = a_{i,j} / \sum_k a_{i,k} If the area of the source polygon is not exhausted by intersections with target polygons and there is reason to not allocate the complete value of an extensive attribute, then setting allocate_total=False will use the following weights: v_j = \sum_i v_i w_{i,j} w_{i,j} = a_{i,j} / a_i where a_i is the total area of source polygon i. For an intensive variable, the estimate at target polygon j is: v_j = \sum_i v_i w_{i,j} w_{i,j} = a_{i,j} / \sum_k a_{k,j} """ source_df = source_df.copy() target_df = target_df.copy() if _check_crs(source_df, target_df): pass else: return None if tables is None: SU, UT = _area_tables(source_df, target_df) else: SU, UT = tables den = source_df["geometry"].area.values if allocate_total: den = SU.sum(axis=1) den = den + (den == 0) weights = np.dot(np.diag(1 / den), SU) dfs = [] extensive = [] if extensive_variables: for variable in extensive_variables: vals = _nan_check(source_df, variable) vals = _inf_check(source_df, variable) estimates = np.dot(np.diag(vals), weights) estimates = np.dot(estimates, UT) estimates = estimates.sum(axis=0) extensive.append(estimates) extensive = np.array(extensive) extensive = pd.DataFrame(extensive.T, columns=extensive_variables) ST = np.dot(SU, UT) area = ST.sum(axis=0) den = np.diag(1.0 / (area + (area == 0))) weights = np.dot(ST, den) intensive = [] if intensive_variables: for variable in intensive_variables: vals = _nan_check(source_df, variable) vals = _inf_check(source_df, variable) vals.shape = (len(vals), 1) est = (vals * weights).sum(axis=0) intensive.append(est) intensive = np.array(intensive) intensive = pd.DataFrame(intensive.T, columns=intensive_variables) if extensive_variables: dfs.append(extensive) if intensive_variables: dfs.append(intensive) df = pd.concat(dfs, axis=1) df["geometry"] = target_df["geometry"].reset_index(drop=True) df = gpd.GeoDataFrame(df.replace(np.inf, np.nan)) return df
def _area_interpolate_binning( source_df, target_df, extensive_variables=None, intensive_variables=None, table=None, allocate_total=True, spatial_index="auto", n_jobs=1, categorical_variables=None, ): """ Area interpolation for extensive, intensive and categorical variables. Parameters ---------- source_df : geopandas.GeoDataFrame target_df : geopandas.GeoDataFrame extensive_variables : list [Optional. Default=None] Columns in dataframes for extensive variables intensive_variables : list [Optional. Default=None] Columns in dataframes for intensive variables table : scipy.sparse.dok_matrix [Optional. Default=None] Area allocation source-target correspondence table. If not provided, it will be built from `source_df` and `target_df` using `tobler.area_interpolate._area_tables_binning` allocate_total : boolean [Optional. Default=True] True if total value of source area should be allocated. False if denominator is area of i. Note that the two cases would be identical when the area of the source polygon is exhausted by intersections. See Notes for more details. spatial_index : str [Optional. Default="auto"] Spatial index to use to build the allocation of area from source to target tables. It currently support the following values: - "source": build the spatial index on `source_df` - "target": build the spatial index on `target_df` - "auto": attempts to guess the most efficient alternative. Currently, this option uses the largest table to build the index, and performs a `bulk_query` on the shorter table. This argument is ignored if n_jobs>1 (or n_jobs=-1). n_jobs : int [Optional. Default=1] Number of processes to run in parallel to generate the area allocation. If -1, this is set to the number of CPUs available. If `table` is passed, this is ignored. NOTE: as of Jan'21 multi-core functionality requires master versions of `pygeos` and `geopandas`. categorical_variables : list [Optional. Default=None] Columns in dataframes for categorical variables Returns ------- estimates : geopandas.GeoDataFrame new geodaraframe with interpolated variables as columns and target_df geometry as output geometry Notes ----- The assumption is both dataframes have the same coordinate reference system. For an extensive variable, the estimate at target polygon j (default case) is: .. math:: v_j = \\sum_i v_i w_{i,j} w_{i,j} = a_{i,j} / \\sum_k a_{i,k} If the area of the source polygon is not exhausted by intersections with target polygons and there is reason to not allocate the complete value of an extensive attribute, then setting allocate_total=False will use the following weights: .. math:: v_j = \\sum_i v_i w_{i,j} w_{i,j} = a_{i,j} / a_i where a_i is the total area of source polygon i. For an intensive variable, the estimate at target polygon j is: .. math:: v_j = \\sum_i v_i w_{i,j} w_{i,j} = a_{i,j} / \\sum_k a_{k,j} For categorical variables, the estimate returns ratio of presence of each unique category. """ source_df = source_df.copy() target_df = target_df.copy() if _check_crs(source_df, target_df): pass else: return None if table is None: if n_jobs == 1: table = _area_tables_binning(source_df, target_df, spatial_index) else: table = _area_tables_binning_parallel(source_df, target_df, n_jobs=n_jobs) den = source_df.area.values if allocate_total: den = np.asarray(table.sum(axis=1)) den = den + (den == 0) den = 1.0 / den n = den.shape[0] den = den.reshape((n, )) den = diags([den], [0]) weights = den.dot(table) # row standardize table dfs = [] extensive = [] if extensive_variables: for variable in extensive_variables: vals = _nan_check(source_df, variable) vals = _inf_check(source_df, variable) estimates = diags([vals], [0]).dot(weights) estimates = estimates.sum(axis=0) extensive.append(estimates.tolist()[0]) extensive = np.asarray(extensive) extensive = np.array(extensive) extensive = pd.DataFrame(extensive.T, columns=extensive_variables) area = np.asarray(table.sum(axis=0)) den = 1.0 / (area + (area == 0)) n, k = den.shape den = den.reshape((k, )) den = diags([den], [0]) weights = table.dot(den) intensive = [] if intensive_variables: for variable in intensive_variables: vals = _nan_check(source_df, variable) vals = _inf_check(source_df, variable) n = vals.shape[0] vals = vals.reshape((n, )) estimates = diags([vals], [0]) estimates = estimates.dot(weights).sum(axis=0) intensive.append(estimates.tolist()[0]) intensive = np.asarray(intensive) intensive = pd.DataFrame(intensive.T, columns=intensive_variables) if categorical_variables: categorical = {} for variable in categorical_variables: unique = source_df[variable].unique() for value in unique: mask = source_df[variable] == value categorical[f"{variable}_{value}"] = np.asarray( table[mask].sum(axis=0))[0] categorical = pd.DataFrame(categorical) categorical = categorical.div(target_df.area.values, axis="rows") if extensive_variables: dfs.append(extensive) if intensive_variables: dfs.append(intensive) if categorical_variables: dfs.append(categorical) df = pd.concat(dfs, axis=1) df["geometry"] = target_df[target_df.geometry.name].reset_index(drop=True) df = gpd.GeoDataFrame(df.replace(np.inf, np.nan)) return df
def _area_interpolate_binning( source_df, target_df, extensive_variables=None, intensive_variables=None, table=None, allocate_total=True, ): """ Area interpolation for extensive and intensive variables. Parameters ---------- source_df : geopandas.GeoDataFrame target_df : geopandas.GeoDataFrame extensive_variables : list columns in dataframes for extensive variables intensive_variables : list columns in dataframes for intensive variables table : scipy.sparse.dok_matrix allocate_total : boolean True if total value of source area should be allocated. False if denominator is area of i. Note that the two cases would be identical when the area of the source polygon is exhausted by intersections. See Notes for more details. Returns ------- estimates : geopandas.GeoDataFrame new geodaraframe with interpolated variables as columns and target_df geometry as output geometry Notes ----- The assumption is both dataframes have the same coordinate reference system. For an extensive variable, the estimate at target polygon j (default case) is: .. math:: v_j = \\sum_i v_i w_{i,j} w_{i,j} = a_{i,j} / \\sum_k a_{i,k} If the area of the source polygon is not exhausted by intersections with target polygons and there is reason to not allocate the complete value of an extensive attribute, then setting allocate_total=False will use the following weights: .. math:: v_j = \\sum_i v_i w_{i,j} w_{i,j} = a_{i,j} / a_i where a_i is the total area of source polygon i. For an intensive variable, the estimate at target polygon j is: .. math:: v_j = \\sum_i v_i w_{i,j} w_{i,j} = a_{i,j} / \\sum_k a_{k,j} """ source_df = source_df.copy() target_df = target_df.copy() if _check_crs(source_df, target_df): pass else: return None if table is None: table = _area_tables_binning(source_df, target_df) den = source_df["geometry"].area.values if allocate_total: den = np.asarray(table.sum(axis=1)) den = den + (den == 0) den = 1.0 / den n = den.shape[0] den = den.reshape((n, )) den = diags([den], [0]) weights = den.dot(table) # row standardize table dfs = [] extensive = [] if extensive_variables: for variable in extensive_variables: vals = _nan_check(source_df, variable) vals = _inf_check(source_df, variable) estimates = diags([vals], [0]).dot(weights) estimates = estimates.sum(axis=0) extensive.append(estimates.tolist()[0]) extensive = np.asarray(extensive) extensive = np.array(extensive) extensive = pd.DataFrame(extensive.T, columns=extensive_variables) area = np.asarray(table.sum(axis=0)) den = 1.0 / (area + (area == 0)) n, k = den.shape den = den.reshape((k, )) den = diags([den], [0]) weights = table.dot(den) intensive = [] if intensive_variables: for variable in intensive_variables: vals = _nan_check(source_df, variable) vals = _inf_check(source_df, variable) n = vals.shape[0] vals = vals.reshape((n, )) estimates = diags([vals], [0]) estimates = estimates.dot(weights).sum(axis=0) intensive.append(estimates.tolist()[0]) intensive = np.asarray(intensive) intensive = pd.DataFrame(intensive.T, columns=intensive_variables) if extensive_variables: dfs.append(extensive) if intensive_variables: dfs.append(intensive) df = pd.concat(dfs, axis=1) df["geometry"] = target_df["geometry"].reset_index(drop=True) df = gpd.GeoDataFrame(df.replace(np.inf, np.nan)) return df