# Code example #1
def _perimeter_area_ratio_spatial_dissim(data,
                                         group_pop_var,
                                         total_pop_var,
                                         standardize=True):
    """
    Calculation of Perimeter/Area Ratio Spatial Dissimilarity index

    Parameters
    ----------
    data          : a geopandas DataFrame with a geometry column.
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit
    standardize   : boolean
                    A condition for standardisation of the weights matrices.
                    If True, the values of cij in the formulas gets standardized and the overall sum is 1.

    Returns
    -------
    statistic : float
                Perimeter/Area Ratio Spatial Dissimilarity Index
    core_data : a geopandas DataFrame
                A geopandas DataFrame that contains the columns used to perform the estimate.

    Raises
    ------
    TypeError
        If data is not a GeoDataFrame or standardize is not a bool.

    Notes
    -----
    Based on Wong, David WS. "Spatial indices of segregation." Urban studies 30.3 (1993): 559-572.
    """
    # NOTE(review): the signature was truncated after ``data,`` in this copy.
    # The remaining parameters are reconstructed from the docstring; the
    # default ``standardize=True`` is assumed to match the upstream PySAL
    # implementation -- confirm before relying on it.
    if (str(type(data)) != '<class \'geopandas.geodataframe.GeoDataFrame\'>'):
        raise TypeError(
            'data is not a GeoDataFrame and, therefore, this index cannot be calculated.'
        )

    # Make sure the active geometry lives in a column literally named
    # 'geometry', because core_data selects it by that name below.
    # NOTE: the first assignment adds the column to the caller's frame in place.
    if 'geometry' not in data.columns:
        data['geometry'] = data[data._geometry_column_name]
        data = data.drop([data._geometry_column_name], axis=1)
        data = data.set_geometry('geometry')

    if type(standardize) is not bool:
        raise TypeError('std is not a boolean object')

    # Classic (aspatial) dissimilarity, corrected below by the spatial term.
    D = _dissim(data, group_pop_var, total_pop_var)[0]

    data = data.rename(columns={
        group_pop_var: 'group_pop_var',
        total_pop_var: 'total_pop_var'
    })

    # If a unit has zero population, the group of interest frequency is zero
    data = data.assign(
        pi=np.where(data.total_pop_var == 0, 0,
                    data.group_pop_var / data.total_pop_var))

    # Dense weights matrix; optionally scaled so that all entries sum to 1.
    cij = _return_length_weighted_w(data).full()[0]
    if standardize:
        cij = cij / cij.sum()

    # Perimeter/area ratio of every unit and the pairwise sums
    # aux_sum[i, j] = ratio[i] + ratio[j].
    pa_ratio = np.asarray(data.length / data.area)
    aux_sum = pa_ratio[np.newaxis, :] + pa_ratio[:, np.newaxis]
    max_pa = pa_ratio.max()

    # manhattan_distances used to compute absolute distances |pi_i - pi_j|.
    # NOTE(review): the tail of this expression was cut off in this copy;
    # ``aux_sum`` is otherwise unused and the statistic is documented as a
    # float, so the product with ``aux_sum`` summed over all pairs is restored.
    num = np.multiply(
        np.multiply(manhattan_distances(data[['pi']]), cij), aux_sum).sum()
    den = 4 * max_pa

    PARD = D - (num / den)

    core_data = data[['group_pop_var', 'total_pop_var', 'geometry']]

    return PARD, core_data
# Code example #2
def _bias_corrected_dissim(data, group_pop_var, total_pop_var, B=500):
    """
    Calculation of Bias Corrected Dissimilarity index

    Parameters
    ----------
    data          : a pandas DataFrame
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit
    B             : int
                    The number of iterations to calculate Dissimilarity simulating randomness with multinomial distributions. Default value is 500.

    Returns
    -------
    statistic : float
                Dissimilarity with Bias-Correction (bias correction from Allen, Rebecca et al. (2015))
    core_data : a pandas DataFrame
                A pandas DataFrame that contains the columns used to perform the estimate.

    Raises
    ------
    TypeError
        If B is not an int, or if B is smaller than 2.

    Notes
    -----
    Based on Allen, Rebecca, et al. "More reliable inference for the dissimilarity index of segregation." The econometrics journal 18.1 (2015): 40-66.

    The simulations draw from NumPy's global random state, so repeated calls
    give different results unless that state is seeded by the caller.
    """
    if type(B) is not int:
        raise TypeError('B must be an integer')

    # TypeError (rather than ValueError) is kept so existing callers that
    # catch it keep working.
    if B < 2:
        raise TypeError('B must be greater than 1.')

    D = _dissim(data, group_pop_var, total_pop_var)[0]

    data = data.rename(columns={
        group_pop_var: 'group_pop_var',
        total_pop_var: 'total_pop_var'
    })

    x = np.array(data.group_pop_var)
    t = np.array(data.total_pop_var)

    other_group_pop = t - x

    # Group 0: minority group
    n0 = x.sum()
    p0_i = x / n0
    sim0 = np.random.multinomial(n0, p0_i, size=B)

    # Group 1: complement group
    n1 = other_group_pop.sum()
    p1_i = other_group_pop / n1
    sim1 = np.random.multinomial(n1, p1_i, size=B)

    # Dissimilarity of each simulated allocation of both groups across units.
    Dbcs = np.empty(B)
    for i in range(B):
        data_aux = {
            'simul_group': sim0[i].tolist(),
            'simul_tot': (sim0[i] + sim1[i]).tolist()
        }
        df_aux = pd.DataFrame.from_dict(data_aux)
        Dbcs[i] = _dissim(df_aux, 'simul_group', 'simul_tot')[0]

    Db = Dbcs.mean()

    # It expected to be lower than D, because D is upwarded biased
    Dbc = 2 * D - Db

    core_data = data[['group_pop_var', 'total_pop_var']]

    return Dbc, core_data
# Code example #3
def _spatial_dissim(data, group_pop_var, total_pop_var, w=None, standardize=False):
    """
    Calculation of Spatial Dissimilarity index

    Parameters
    ----------
    data          : a geopandas DataFrame with a geometry column.
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit
    w             : W
                    A PySAL weights object. If not provided, Queen contiguity matrix is used.
    standardize   : boolean
                    A condition for row standardisation of the weights matrices. If True, the values of cij in the formulas gets row standardized.
                    For the sake of comparison, the seg R package of Hong, Seong-Yun, David O'Sullivan, and Yukio Sadahiro. "Implementing spatial segregation measures in R." PloS one 9.11 (2014): e113767.
                    works by default with row standardization.

    Returns
    -------
    statistic : float
                Spatial Dissimilarity Index
    core_data : a geopandas DataFrame
                A geopandas DataFrame that contains the columns used to perform the estimate.

    Raises
    ------
    TypeError
        If data is not a GeoDataFrame, standardize is not a bool, or w is
        not a PySAL weights object.

    Notes
    -----
    Based on Morrill, R. L. (1991) "On the Measure of Geographic Segregation". Geography Research Forum.
    """
    if (str(type(data)) != '<class \'geopandas.geodataframe.GeoDataFrame\'>'):
        raise TypeError('data is not a GeoDataFrame and, therefore, this index cannot be calculated.')

    # Make sure the active geometry lives in a column literally named
    # 'geometry', because core_data selects it by that name below.
    # NOTE: the first assignment adds the column to the caller's frame in place.
    if 'geometry' not in data.columns:
        data['geometry'] = data[data._geometry_column_name]
        data = data.drop([data._geometry_column_name], axis=1)
        data = data.set_geometry('geometry')

    if type(standardize) is not bool:
        raise TypeError('std is not a boolean object')

    # Fall back to Queen contiguity only when the caller gave no weights;
    # without the else branch a supplied ``w`` would never be used.
    if w is None:
        w_object = Queen.from_dataframe(data)
    else:
        w_object = w

    if not isinstance(w_object, libpysal.weights.W):
        raise TypeError('w is not a PySAL weights object')

    # Classic (aspatial) dissimilarity, corrected below by the spatial term.
    D = _dissim(data, group_pop_var, total_pop_var)[0]

    data = data.rename(columns={group_pop_var: 'group_pop_var',
                                total_pop_var: 'total_pop_var'})

    x = np.array(data.group_pop_var)
    t = np.array(data.total_pop_var)

    # If a unit has zero population, the group of interest frequency is zero
    pi = np.where(t == 0, 0, x / t)

    # Dense weights matrix; optionally scaled so that every row sums to 1.
    cij = w_object.full()[0]
    if standardize:
        cij = cij / cij.sum(axis=1, keepdims=True)

    # Inspired in (second solution): https://stackoverflow.com/questions/22720864/efficiently-calculating-a-euclidean-distance-matrix-using-numpy
    # Pairwise absolute differences |pi_i - pi_j| via broadcasting.
    abs_dist = abs(pi[..., np.newaxis] - pi)

    num = np.multiply(abs_dist, cij).sum()
    den = cij.sum()
    SD = D - num / den

    core_data = data[['group_pop_var', 'total_pop_var', 'geometry']]

    return SD, core_data
# Code example #4
def _boundary_spatial_dissim(data,
                             group_pop_var,
                             total_pop_var,
                             standardize=False):
    """
    Calculation of Boundary Spatial Dissimilarity index

    Parameters
    ----------
    data          : a geopandas DataFrame with a geometry column.
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit
    standardize   : boolean
                    A condition for row standardisation of the weights matrices. If True, the values of cij in the formulas gets row standardized.
                    For the sake of comparison, the seg R package of Hong, Seong-Yun, David O'Sullivan, and Yukio Sadahiro. "Implementing spatial segregation measures in R." PloS one 9.11 (2014): e113767.
                    works by default without row standardization. That is, directly with border length.

    Returns
    -------
    statistic : float
                Boundary Spatial Dissimilarity Index
    core_data : a geopandas DataFrame
                A geopandas DataFrame that contains the columns used to perform the estimate.

    Raises
    ------
    TypeError
        If data is not a GeoDataFrame or standardize is not a bool.

    Notes
    -----
    The formula is based on Hong, Seong-Yun, David O'Sullivan, and Yukio Sadahiro. "Implementing spatial segregation measures in R." PloS one 9.11 (2014): e113767.
    Original paper by Wong, David WS. "Spatial indices of segregation." Urban studies 30.3 (1993): 559-572.
    """
    # NOTE(review): the signature was truncated after ``data,`` in this copy.
    # The remaining parameters are reconstructed from the docstring; the
    # default ``standardize=False`` is assumed to match the upstream PySAL
    # implementation -- confirm before relying on it.
    if (str(type(data)) != '<class \'geopandas.geodataframe.GeoDataFrame\'>'):
        raise TypeError(
            'data is not a GeoDataFrame and, therefore, this index cannot be calculated.'
        )

    # Make sure the active geometry lives in a column literally named
    # 'geometry', because core_data selects it by that name below.
    # NOTE: the first assignment adds the column to the caller's frame in place.
    if 'geometry' not in data.columns:
        data['geometry'] = data[data._geometry_column_name]
        data = data.drop([data._geometry_column_name], axis=1)
        data = data.set_geometry('geometry')

    if type(standardize) is not bool:
        raise TypeError('std is not a boolean object')

    # Classic (aspatial) dissimilarity, corrected below by the spatial term.
    D = _dissim(data, group_pop_var, total_pop_var)[0]

    data = data.rename(columns={
        group_pop_var: 'group_pop_var',
        total_pop_var: 'total_pop_var'
    })

    # If a unit has zero population, the group of interest frequency is zero
    data = data.assign(
        pi=np.where(data.total_pop_var == 0, 0,
                    data.group_pop_var / data.total_pop_var))

    # Dense weights matrix; optionally scaled so that every row sums to 1.
    cij = _return_length_weighted_w(data).full()[0]
    if standardize:
        cij = cij / cij.sum(axis=1, keepdims=True)

    # manhattan_distances used to compute absolute distances
    num = np.multiply(manhattan_distances(data[['pi']]), cij).sum()
    den = cij.sum()
    BSD = D - num / den

    core_data = data[['group_pop_var', 'total_pop_var', 'geometry']]

    return BSD, core_data
# Code example #5
def _modified_dissim(data, group_pop_var, total_pop_var, iterations=500):
    """
    Calculation of Modified Dissimilarity index

    Parameters
    ----------
    data          : a pandas DataFrame
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit
    iterations    : int
                    The number of iterations the evaluate average classic dissimilarity under eveness. Default value is 500.

    Returns
    -------
    statistic : float
                Modified Dissimilarity Index (Dissimilarity from Carrington and Troske (1997))
    core_data : a pandas DataFrame
                A pandas DataFrame that contains the columns used to perform the estimate.

    Raises
    ------
    TypeError
        If iterations is not an int, or if iterations is smaller than 2.

    Notes
    -----
    Based on Carrington, William J., and Kenneth R. Troske. "On measuring segregation in samples with small units." Journal of Business & Economic Statistics 15.4 (1997): 402-409.

    The simulations draw from NumPy's global random state, so repeated calls
    give different results unless that state is seeded by the caller.
    """
    if type(iterations) is not int:
        raise TypeError('iterations must be an integer')

    # TypeError (rather than ValueError) is kept so existing callers that
    # catch it keep working.
    if iterations < 2:
        raise TypeError('iterations must be greater than 1.')

    D = _dissim(data, group_pop_var, total_pop_var)[0]

    data = data.rename(columns={group_pop_var: 'group_pop_var',
                                total_pop_var: 'total_pop_var'})

    # core_data is captured before the simulation loop so it holds the
    # observed counts, not the simulated ones assigned below.
    core_data = data[['group_pop_var', 'total_pop_var']]

    x = np.array(data.group_pop_var)
    t = np.array(data.total_pop_var)

    # Overall share of the group: the per-unit probability under evenness.
    p_null = x.sum() / t.sum()

    Ds = np.empty(iterations)
    for i in range(iterations):
        # One binomial draw per unit, with that unit's total as trial count.
        freq_sim = np.random.binomial(n=t, p=p_null, size=t.shape)
        sim_data = data.assign(group_pop_var=freq_sim)
        Ds[i] = _dissim(sim_data, 'group_pop_var', 'total_pop_var')[0]

    D_star = Ds.mean()

    # The excess over the random benchmark is rescaled by the room left above
    # it when D >= D_star, and by the benchmark itself otherwise.
    if D >= D_star:
        Dct = (D - D_star) / (1 - D_star)
    else:
        Dct = (D - D_star) / D_star

    return Dct, core_data