Python _dissimの例、segregation.dissimilarity._dissim Pythonの例

コード例 #1

0

ファイルを表示

def _perimeter_area_ratio_spatial_dissim(data,
                                         group_pop_var,
                                         total_pop_var,
                                         standardize=True):
    """
    Calculation of Perimeter/Area Ratio Spatial Dissimilarity index

    Parameters
    ----------

    data          : a geopandas DataFrame with a geometry column.
    
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest
                    
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit
                    
    standardize   : boolean
                    A condition for standardisation of the weights matrices. 
                    If True, the values of cij in the formulas gets standardized and the overall sum is 1.

    Attributes
    ----------

    statistic : float
                Perimeter/Area Ratio Spatial Dissimilarity Index
                
    core_data : a geopandas DataFrame
                A geopandas DataFrame that contains the columns used to perform the estimate.
                
    Notes
    -----
    Based on Wong, David WS. "Spatial indices of segregation." Urban studies 30.3 (1993): 559-572.

    """

    if (str(type(data)) != '<class \'geopandas.geodataframe.GeoDataFrame\'>'):
        raise TypeError(
            'data is not a GeoDataFrame and, therefore, this index cannot be calculated.'
        )

    if ('geometry' not in data.columns):
        data['geometry'] = data[data._geometry_column_name]
        data = data.drop([data._geometry_column_name], axis=1)
        data = data.set_geometry('geometry')

    if (type(standardize) is not bool):
        raise TypeError('std is not a boolean object')

    D = _dissim(data, group_pop_var, total_pop_var)[0]

    data = data.rename(columns={
        group_pop_var: 'group_pop_var',
        total_pop_var: 'total_pop_var'
    })

    # If a unit has zero population, the group of interest frequency is zero
    data = data.assign(
        pi=np.where(data.total_pop_var == 0, 0, data.group_pop_var /
                    data.total_pop_var))

    if not standardize:
        cij = _return_length_weighted_w(data).full()[0]
    else:
        cij = _return_length_weighted_w(data).full()[0]
        cij = cij / cij.sum()

    peri = data.length
    ai = data.area

    aux_sum = np.add(
        np.array(list((peri / ai))),
        np.array(list((peri / ai))).reshape((len(list((peri / ai))), 1)))

    max_pa = max(peri / ai)

    num = np.multiply(np.multiply(manhattan_distances(data[['pi']]), cij),
                      aux_sum).sum()
    den = 4 * max_pa

    PARD = D - (num / den)
    PARD

    core_data = data[['group_pop_var', 'total_pop_var', 'geometry']]

    return PARD, core_data

コード例 #2

0

ファイルを表示

ファイル: bias_corrected_dissimilarity.py プロジェクト: ljwolf/segregation

def _bias_corrected_dissim(data, group_pop_var, total_pop_var, B=500):
    """
    Calculation of Bias Corrected Dissimilarity index

    Parameters
    ----------

    data          : a pandas DataFrame
    
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest
                    
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit
                    
    B             : int
                    The number of iterations to calculate Dissimilarity simulating randomness with multinomial distributions. Default value is 500.

    Attributes
    ----------

    statistic : float
                Dissimilarity with Bias-Correction (bias correction from Allen, Rebecca et al. (2015))
                
    core_data : a pandas DataFrame
                A pandas DataFrame that contains the columns used to perform the estimate. 

    Notes
    -----
    Based on Allen, Rebecca, et al. "More reliable inference for the dissimilarity index of segregation." The econometrics journal 18.1 (2015): 40-66.

    """
    if (type(B) is not int):
        raise TypeError('B must be an integer')

    if (B < 2):
        raise TypeError('B must be greater than 1.')

    D = _dissim(data, group_pop_var, total_pop_var)[0]

    data = data.rename(columns={
        group_pop_var: 'group_pop_var',
        total_pop_var: 'total_pop_var'
    })

    x = np.array(data.group_pop_var)
    t = np.array(data.total_pop_var)

    other_group_pop = t - x

    # Group 0: minority group
    p0_i = x / x.sum()
    n0 = x.sum()
    sim0 = np.random.multinomial(n0, p0_i, size=B)

    # Group 1: complement group
    p1_i = other_group_pop / other_group_pop.sum()
    n1 = other_group_pop.sum()
    sim1 = np.random.multinomial(n1, p1_i, size=B)

    Dbcs = np.empty(B)
    for i in np.array(range(B)):
        data_aux = {
            'simul_group': sim0[i].tolist(),
            'simul_tot': (sim0[i] + sim1[i]).tolist()
        }
        df_aux = pd.DataFrame.from_dict(data_aux)
        Dbcs[i] = _dissim(df_aux, 'simul_group', 'simul_tot')[0]

    Db = Dbcs.mean()

    Dbc = 2 * D - Db
    Dbc  # It expected to be lower than D, because D is upwarded biased

    core_data = data[['group_pop_var', 'total_pop_var']]

    return Dbc, core_data

コード例 #3

0

ファイルを表示

ファイル: spatial_dissimilarity.py プロジェクト: ljwolf/segregation

def _spatial_dissim(data, group_pop_var, total_pop_var, w = None, standardize = False):
    """
    Calculation of Spatial Dissimilarity index

    Parameters
    ----------

    data          : a geopandas DataFrame with a geometry column.
    
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest
                    
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit
                    
    w             : W
                    A PySAL weights object. If not provided, Queen contiguity matrix is used.
                    
    standardize   : boolean
                    A condition for row standardisation of the weights matrices. If True, the values of cij in the formulas gets row standardized.
                    For the sake of comparison, the seg R package of Hong, Seong-Yun, David O'Sullivan, and Yukio Sadahiro. "Implementing spatial segregation measures in R." PloS one 9.11 (2014): e113767.
                    works by default with row standardization.
        

    Attributes
    ----------

    statistic : float
                Spatial Dissimilarity Index
                
    core_data : a geopandas DataFrame
                A geopandas DataFrame that contains the columns used to perform the estimate.
                
    Notes
    -----
    Based on Morrill, R. L. (1991) "On the Measure of Geographic Segregation". Geography Research Forum.

    """
    if (str(type(data)) != '<class \'geopandas.geodataframe.GeoDataFrame\'>'):
        raise TypeError('data is not a GeoDataFrame and, therefore, this index cannot be calculated.')
        
    if ('geometry' not in data.columns):
        data['geometry'] = data[data._geometry_column_name]
        data = data.drop([data._geometry_column_name], axis = 1)
        data = data.set_geometry('geometry')
        
    if (type(standardize) is not bool):
        raise TypeError('std is not a boolean object')
        
    if w is None:    
        w_object = Queen.from_dataframe(data)
    else:
        w_object = w
    
    if (not issubclass(type(w_object), libpysal.weights.W)):
        raise TypeError('w is not a PySAL weights object')
    
    D = _dissim(data, group_pop_var, total_pop_var)[0]
    
    data = data.rename(columns={group_pop_var: 'group_pop_var', 
                                total_pop_var: 'total_pop_var'})
    
    x = np.array(data.group_pop_var)
    t = np.array(data.total_pop_var)
    
    # If a unit has zero population, the group of interest frequency is zero
    pi = np.where(t == 0, 0, x / t)
    
    if not standardize:
        cij = w_object.full()[0]
    else:
        cij = w_object.full()[0]
        cij = cij / cij.sum(axis = 1).reshape((cij.shape[0], 1))

    # Inspired in (second solution): https://stackoverflow.com/questions/22720864/efficiently-calculating-a-euclidean-distance-matrix-using-numpy
    # Distance Matrix
    abs_dist = abs(pi[..., np.newaxis] - pi)
        
    # manhattan_distances used to compute absolute distances
    num = np.multiply(abs_dist, cij).sum()
    den = cij.sum()
    SD = D - num / den
    SD
    
    core_data = data[['group_pop_var', 'total_pop_var', 'geometry']]
    
    return SD, core_data

コード例 #4

0

ファイルを表示

ファイル: boundary_spatial_dissimilarity.py プロジェクト: ljwolf/segregation

def _boundary_spatial_dissim(data,
                             group_pop_var,
                             total_pop_var,
                             standardize=False):
    """
    Calculation of Boundary Spatial Dissimilarity index

    Parameters
    ----------

    data          : a geopandas DataFrame with a geometry column.
    
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest
                    
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit
                    
    standardize   : boolean
                    A condition for row standardisation of the weights matrices. If True, the values of cij in the formulas gets row standardized.
                    For the sake of comparison, the seg R package of Hong, Seong-Yun, David O'Sullivan, and Yukio Sadahiro. "Implementing spatial segregation measures in R." PloS one 9.11 (2014): e113767.
                    works by default without row standardization. That is, directly with border length.
        

    Attributes
    ----------

    statistic : float
                Boundary Spatial Dissimilarity Index
                
    core_data : a geopandas DataFrame
                A geopandas DataFrame that contains the columns used to perform the estimate.
                
    Notes
    -----
    The formula is based on Hong, Seong-Yun, David O'Sullivan, and Yukio Sadahiro. "Implementing spatial segregation measures in R." PloS one 9.11 (2014): e113767.
    
    Original paper by Wong, David WS. "Spatial indices of segregation." Urban studies 30.3 (1993): 559-572.

    """

    if (str(type(data)) != '<class \'geopandas.geodataframe.GeoDataFrame\'>'):
        raise TypeError(
            'data is not a GeoDataFrame and, therefore, this index cannot be calculated.'
        )

    if ('geometry' not in data.columns):
        data['geometry'] = data[data._geometry_column_name]
        data = data.drop([data._geometry_column_name], axis=1)
        data = data.set_geometry('geometry')

    if (type(standardize) is not bool):
        raise TypeError('std is not a boolean object')

    D = _dissim(data, group_pop_var, total_pop_var)[0]

    data = data.rename(columns={
        group_pop_var: 'group_pop_var',
        total_pop_var: 'total_pop_var'
    })

    # If a unit has zero population, the group of interest frequency is zero
    data = data.assign(
        pi=np.where(data.total_pop_var == 0, 0, data.group_pop_var /
                    data.total_pop_var))

    if not standardize:
        cij = _return_length_weighted_w(data).full()[0]
    else:
        cij = _return_length_weighted_w(data).full()[0]
        cij = cij / cij.sum(axis=1).reshape((cij.shape[0], 1))

    # manhattan_distances used to compute absolute distances
    num = np.multiply(manhattan_distances(data[['pi']]), cij).sum()
    den = cij.sum()
    BSD = D - num / den
    BSD

    core_data = data[['group_pop_var', 'total_pop_var', 'geometry']]

    return BSD, core_data

コード例 #5

0

ファイルを表示

ファイル: modified_dissimilarity.py プロジェクト: ljwolf/segregation

def _modified_dissim(data, group_pop_var, total_pop_var, iterations = 500):
    """
    Calculation of Modified Dissimilarity index

    Parameters
    ----------

    data          : a pandas DataFrame
    
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest
                    
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit
                    
    iterations    : int
                    The number of iterations the evaluate average classic dissimilarity under eveness. Default value is 500.

    Attributes
    ----------

    statistic : float
                Modified Dissimilarity Index (Dissimilarity from Carrington and Troske (1997))
                
    core_data : a pandas DataFrame
                A pandas DataFrame that contains the columns used to perform the estimate. 
                
    Notes
    -----
    Based on Carrington, William J., and Kenneth R. Troske. "On measuring segregation in samples with small units." Journal of Business & Economic Statistics 15.4 (1997): 402-409.

    """
    if(type(iterations) is not int):
        raise TypeError('iterations must be an integer')
        
    if(iterations < 2):
        raise TypeError('iterations must be greater than 1.')
   
    D = _dissim(data, group_pop_var, total_pop_var)[0]
    
    data = data.rename(columns={group_pop_var: 'group_pop_var', 
                                total_pop_var: 'total_pop_var'})
    
    # core_data has to be in the beggining of the call because assign methods will be used later
    core_data = data[['group_pop_var', 'total_pop_var']]
    
    x = np.array(data.group_pop_var)
    t = np.array(data.total_pop_var)
    
    p_null = x.sum() / t.sum()
    
    Ds = np.empty(iterations)
    
    for i in np.array(range(iterations)):

        freq_sim = np.random.binomial(n = np.array([t.tolist()]), 
                                      p = np.array([[p_null] * data.shape[0]]), 
                                      size = (1, data.shape[0])).tolist()[0]
        data = data.assign(group_pop_var = freq_sim)
        aux = _dissim(data, 'group_pop_var', 'total_pop_var')[0]
        Ds[i] = aux
        
    D_star = Ds.mean()
    
    if (D >= D_star):
        Dct = (D - D_star)/(1 - D_star)
    else:
        Dct = (D - D_star)/D_star
    
    return Dct, core_data