def _perimeter_area_ratio_spatial_dissim(data,
                                         group_pop_var,
                                         total_pop_var,
                                         standardize=True):
    """
    Calculation of Perimeter/Area Ratio Spatial Dissimilarity index

    Parameters
    ----------

    data          : a geopandas DataFrame with a geometry column.

    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest

    total_pop_var : string
                    The name of variable in data that contains the total population of the unit

    standardize   : boolean
                    A condition for standardisation of the weights matrices.
                    If True, the values of cij in the formulas gets standardized and the overall sum is 1.

    Returns
    -------

    statistic : float
                Perimeter/Area Ratio Spatial Dissimilarity Index

    core_data : a geopandas DataFrame
                A geopandas DataFrame that contains the columns used to perform the estimate.

    Raises
    ------

    TypeError
        If data is not a GeoDataFrame or if standardize is not a bool.

    Notes
    -----
    Based on Wong, David WS. "Spatial indices of segregation." Urban studies 30.3 (1993): 559-572.

    The frame passed in by the caller is left untouched: when its active
    geometry column is not called 'geometry', the renaming is done on a copy.

    """

    # The class is matched through its exact repr, so subclasses of
    # GeoDataFrame are rejected too.
    if str(type(data)) != "<class 'geopandas.geodataframe.GeoDataFrame'>":
        raise TypeError(
            'data is not a GeoDataFrame and, therefore, this index cannot be calculated.'
        )

    if 'geometry' not in data.columns:
        # Copy first: assigning the new column directly would add a
        # 'geometry' column to the caller's own GeoDataFrame.
        data = data.copy()
        data['geometry'] = data[data._geometry_column_name]
        data = data.drop([data._geometry_column_name], axis=1)
        data = data.set_geometry('geometry')

    if type(standardize) is not bool:
        raise TypeError('std is not a boolean object')

    # The aspatial dissimilarity is the baseline the spatial term is subtracted from.
    D = _dissim(data, group_pop_var, total_pop_var)[0]

    data = data.rename(columns={
        group_pop_var: 'group_pop_var',
        total_pop_var: 'total_pop_var'
    })

    # If a unit has zero population, the group of interest frequency is zero
    data = data.assign(
        pi=np.where(data.total_pop_var == 0, 0,
                    data.group_pop_var / data.total_pop_var))

    cij = _return_length_weighted_w(data).full()[0]
    if standardize:
        # Global standardisation: the whole matrix sums to 1.
        cij = cij / cij.sum()

    # Perimeter/area ratio of every unit and the pairwise sums
    # (P_i / A_i + P_j / A_j) used to weight each pair of units.
    pa_ratio = np.asarray(data.length / data.area)
    pairwise_pa_sum = pa_ratio + pa_ratio[:, np.newaxis]
    max_pa = pa_ratio.max()

    # manhattan_distances on a single column yields |pi_i - pi_j|.
    num = (manhattan_distances(data[['pi']]) * cij * pairwise_pa_sum).sum()
    den = 4 * max_pa

    PARD = D - (num / den)

    core_data = data[['group_pop_var', 'total_pop_var', 'geometry']]

    return PARD, core_data
def _bias_corrected_dissim(data, group_pop_var, total_pop_var, B=500):
    """
    Calculation of Bias Corrected Dissimilarity index

    Parameters
    ----------

    data          : a pandas DataFrame

    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest

    total_pop_var : string
                    The name of variable in data that contains the total population of the unit

    B             : int
                    The number of simulated samples, each drawn from multinomial distributions,
                    on which Dissimilarity is recomputed. Default value is 500.

    Returns
    -------

    statistic : float
                Dissimilarity with Bias-Correction (bias correction from Allen, Rebecca et al. (2015))

    core_data : a pandas DataFrame
                A pandas DataFrame that contains the columns used to perform the estimate.

    Raises
    ------

    TypeError
        If B is not an int, or if B is smaller than 2.

    Notes
    -----
    Based on Allen, Rebecca, et al. "More reliable inference for the dissimilarity index of segregation." The econometrics journal 18.1 (2015): 40-66.

    """

    if type(B) is not int:
        raise TypeError('B must be an integer')

    if B < 2:
        raise TypeError('B must be greater than 1.')

    observed_d = _dissim(data, group_pop_var, total_pop_var)[0]

    renamed = data.rename(columns={
        group_pop_var: 'group_pop_var',
        total_pop_var: 'total_pop_var'
    })

    group_counts = np.array(renamed.group_pop_var)
    total_counts = np.array(renamed.total_pop_var)
    rest_counts = total_counts - group_counts

    # Group 0 (group of interest): redistribute its members over the units
    # following the observed shares. Drawn first, before the complement group.
    group_total = group_counts.sum()
    group_draws = np.random.multinomial(group_total,
                                        group_counts / group_total,
                                        size=B)

    # Group 1 (everybody else): same procedure.
    rest_total = rest_counts.sum()
    rest_draws = np.random.multinomial(rest_total,
                                       rest_counts / rest_total,
                                       size=B)

    # Dissimilarity of each of the B simulated samples.
    simulated_d = np.empty(B)
    for k, (group_row, rest_row) in enumerate(zip(group_draws, rest_draws)):
        sample = pd.DataFrame({
            'simul_group': group_row.tolist(),
            'simul_tot': (group_row + rest_row).tolist()
        })
        simulated_d[k] = _dissim(sample, 'simul_group', 'simul_tot')[0]

    # Expected to be lower than the observed value, because D is upward biased.
    corrected_d = 2 * observed_d - simulated_d.mean()

    return corrected_d, renamed[['group_pop_var', 'total_pop_var']]
def _spatial_dissim(data, group_pop_var, total_pop_var, w = None, standardize = False):
    """
    Calculation of Spatial Dissimilarity index

    Parameters
    ----------

    data          : a geopandas DataFrame with a geometry column.

    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest

    total_pop_var : string
                    The name of variable in data that contains the total population of the unit

    w             : W
                    A PySAL weights object. If not provided, Queen contiguity matrix is used.

    standardize   : boolean
                    A condition for row standardisation of the weights matrices. If True, the values of cij in the formulas gets row standardized.
                    For the sake of comparison, the seg R package of Hong, Seong-Yun, David O'Sullivan, and Yukio Sadahiro. "Implementing spatial segregation measures in R." PloS one 9.11 (2014): e113767.
                    works by default with row standardization.

    Returns
    -------

    statistic : float
                Spatial Dissimilarity Index

    core_data : a geopandas DataFrame
                A geopandas DataFrame that contains the columns used to perform the estimate.

    Raises
    ------

    TypeError
        If data is not a GeoDataFrame, if standardize is not a bool, or if
        w is given but is not a PySAL weights object.

    Notes
    -----
    Based on Morrill, R. L. (1991) "On the Measure of Geographic Segregation". Geography Research Forum.

    The frame passed in by the caller is left untouched: when its active
    geometry column is not called 'geometry', the renaming is done on a copy.
    With row standardisation, units without neighbours keep an all-zero row
    of weights instead of producing NaN.

    """

    # The class is matched through its exact repr, so subclasses of
    # GeoDataFrame are rejected too.
    if str(type(data)) != "<class 'geopandas.geodataframe.GeoDataFrame'>":
        raise TypeError('data is not a GeoDataFrame and, therefore, this index cannot be calculated.')

    if 'geometry' not in data.columns:
        # Copy first: assigning the new column directly would add a
        # 'geometry' column to the caller's own GeoDataFrame.
        data = data.copy()
        data['geometry'] = data[data._geometry_column_name]
        data = data.drop([data._geometry_column_name], axis = 1)
        data = data.set_geometry('geometry')

    if type(standardize) is not bool:
        raise TypeError('std is not a boolean object')

    if w is None:
        w_object = Queen.from_dataframe(data)
    elif isinstance(w, libpysal.weights.W):
        w_object = w
    else:
        raise TypeError('w is not a PySAL weights object')

    # The aspatial dissimilarity is the baseline the spatial term is subtracted from.
    D = _dissim(data, group_pop_var, total_pop_var)[0]

    data = data.rename(columns={group_pop_var: 'group_pop_var',
                                total_pop_var: 'total_pop_var'})

    x = np.array(data.group_pop_var)
    t = np.array(data.total_pop_var)

    # If a unit has zero population, the group of interest frequency is zero.
    # The division is still evaluated for those units before np.where
    # discards it, so the resulting floating point warnings are silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        pi = np.where(t == 0, 0, x / t)

    cij = w_object.full()[0]
    if standardize:
        # Row standardisation. Rows that sum to zero (units without
        # neighbours) are left as zeros; dividing them would give NaN and
        # propagate to the final statistic.
        row_sums = cij.sum(axis = 1).reshape((cij.shape[0], 1))
        cij = np.divide(cij, row_sums,
                        out = np.zeros(cij.shape, dtype = float),
                        where = row_sums != 0)

    # Pairwise absolute differences |pi_i - pi_j| through broadcasting.
    # Inspired in (second solution): https://stackoverflow.com/questions/22720864/efficiently-calculating-a-euclidean-distance-matrix-using-numpy
    abs_dist = np.abs(pi[:, np.newaxis] - pi)

    num = (abs_dist * cij).sum()
    den = cij.sum()
    SD = D - num / den

    core_data = data[['group_pop_var', 'total_pop_var', 'geometry']]

    return SD, core_data
def _boundary_spatial_dissim(data,
                             group_pop_var,
                             total_pop_var,
                             standardize=False):
    """
    Calculation of Boundary Spatial Dissimilarity index

    Parameters
    ----------

    data          : a geopandas DataFrame with a geometry column.

    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest

    total_pop_var : string
                    The name of variable in data that contains the total population of the unit

    standardize   : boolean
                    A condition for row standardisation of the weights matrices. If True, the values of cij in the formulas gets row standardized.
                    For the sake of comparison, the seg R package of Hong, Seong-Yun, David O'Sullivan, and Yukio Sadahiro. "Implementing spatial segregation measures in R." PloS one 9.11 (2014): e113767.
                    works by default without row standardization. That is, directly with border length.

    Returns
    -------

    statistic : float
                Boundary Spatial Dissimilarity Index

    core_data : a geopandas DataFrame
                A geopandas DataFrame that contains the columns used to perform the estimate.

    Raises
    ------

    TypeError
        If data is not a GeoDataFrame or if standardize is not a bool.

    Notes
    -----
    The formula is based on Hong, Seong-Yun, David O'Sullivan, and Yukio Sadahiro. "Implementing spatial segregation measures in R." PloS one 9.11 (2014): e113767.

    Original paper by Wong, David WS. "Spatial indices of segregation." Urban studies 30.3 (1993): 559-572.

    The frame passed in by the caller is left untouched: when its active
    geometry column is not called 'geometry', the renaming is done on a copy.
    With row standardisation, units whose row of weights sums to zero keep an
    all-zero row instead of producing NaN.

    """

    # The class is matched through its exact repr, so subclasses of
    # GeoDataFrame are rejected too.
    if str(type(data)) != "<class 'geopandas.geodataframe.GeoDataFrame'>":
        raise TypeError(
            'data is not a GeoDataFrame and, therefore, this index cannot be calculated.'
        )

    if 'geometry' not in data.columns:
        # Copy first: assigning the new column directly would add a
        # 'geometry' column to the caller's own GeoDataFrame.
        data = data.copy()
        data['geometry'] = data[data._geometry_column_name]
        data = data.drop([data._geometry_column_name], axis=1)
        data = data.set_geometry('geometry')

    if type(standardize) is not bool:
        raise TypeError('std is not a boolean object')

    # The aspatial dissimilarity is the baseline the spatial term is subtracted from.
    D = _dissim(data, group_pop_var, total_pop_var)[0]

    data = data.rename(columns={
        group_pop_var: 'group_pop_var',
        total_pop_var: 'total_pop_var'
    })

    # If a unit has zero population, the group of interest frequency is zero
    data = data.assign(
        pi=np.where(data.total_pop_var == 0, 0,
                    data.group_pop_var / data.total_pop_var))

    cij = _return_length_weighted_w(data).full()[0]
    if standardize:
        # Row standardisation. Rows that sum to zero are left as zeros;
        # dividing them would give NaN and propagate to the final statistic.
        row_sums = cij.sum(axis=1).reshape((cij.shape[0], 1))
        cij = np.divide(cij, row_sums,
                        out=np.zeros(cij.shape, dtype=float),
                        where=row_sums != 0)

    # manhattan_distances used to compute absolute distances
    num = (manhattan_distances(data[['pi']]) * cij).sum()
    den = cij.sum()
    BSD = D - num / den

    core_data = data[['group_pop_var', 'total_pop_var', 'geometry']]

    return BSD, core_data
def _modified_dissim(data, group_pop_var, total_pop_var, iterations = 500):
    """
    Calculation of Modified Dissimilarity index

    Parameters
    ----------

    data          : a pandas DataFrame

    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest

    total_pop_var : string
                    The name of variable in data that contains the total population of the unit

    iterations    : int
                    The number of simulated samples used to average the classic dissimilarity under evenness.
                    Default value is 500.

    Returns
    -------

    statistic : float
                Modified Dissimilarity Index (Dissimilarity from Carrington and Troske (1997))

    core_data : a pandas DataFrame
                A pandas DataFrame that contains the columns used to perform the estimate.

    Raises
    ------

    TypeError
        If iterations is not an int, or if iterations is smaller than 2.

    Notes
    -----
    Based on Carrington, William J., and Kenneth R. Troske. "On measuring segregation in samples with small units." Journal of Business & Economic Statistics 15.4 (1997): 402-409.

    """

    if type(iterations) is not int:
        raise TypeError('iterations must be an integer')

    if iterations < 2:
        raise TypeError('iterations must be greater than 1.')

    observed_d = _dissim(data, group_pop_var, total_pop_var)[0]

    frame = data.rename(columns={group_pop_var: 'group_pop_var',
                                 total_pop_var: 'total_pop_var'})

    # Taken before the loop: the simulated counts overwrite group_pop_var
    # in the working frame on every iteration.
    core_data = frame[['group_pop_var', 'total_pop_var']]

    group_counts = np.array(frame.group_pop_var)
    total_counts = np.array(frame.total_pop_var)
    n_units = frame.shape[0]

    # Under evenness every unit has the overall share of the group.
    p_null = group_counts.sum() / total_counts.sum()

    # Arguments of the binomial draw; they do not change between iterations.
    trials = np.array([total_counts.tolist()])
    probabilities = np.array([[p_null] * n_units])

    simulated_d = np.empty(iterations)
    for k in range(iterations):
        draw = np.random.binomial(n = trials,
                                  p = probabilities,
                                  size = (1, n_units))
        frame = frame.assign(group_pop_var = draw.tolist()[0])
        simulated_d[k] = _dissim(frame, 'group_pop_var', 'total_pop_var')[0]

    expected_d = simulated_d.mean()

    # The excess over the evenness benchmark is scaled by the room left above
    # it, a shortfall by the benchmark itself.
    if observed_d >= expected_d:
        scale = 1 - expected_d
    else:
        scale = expected_d

    return (observed_d - expected_d) / scale, core_data