def dissimilarity(distribution, classes=None):
    r""" Compute the inter-class dissimilarity index

    The dissimilarity index between two categories `\alpha` and `\beta` is
    defined as

    ..math::
        D_{\alpha \beta} = \frac{1}{2} \sum_{i=1}^{T} \left|
        \frac{n_\alpha(t)}{N_\alpha} - \frac{n_\beta(t)}{N_\beta} \right|

    Its value ranges from 0 to 1.

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    classes: dictionary of lists
        When the original categories need to be aggregated into different
        classes.
        > {class: [categories belonging to this class]}
        This can be arbitrarily imposed, or computed with uncover_classes
        function of this package.

    Returns
    -------

    dissimilarity: nested dictionary
        Classes matrix with dissimilarity as values
        > {alpha: {beta: D_{\alpha \beta}}}
    """
    ## Regroup into classes if specified
    if classes is not None:
        distribution = regroup_per_class(distribution, classes)
    else:
        classes = return_categories(distribution)

    ## Compute total numbers of individuals per class and areal unit
    N_unit, N_class, N_tot = compute_totals(distribution, classes)

    ## Compute the dissimilarity matrix
    # Only half of the values are computed (the matrix is symmetric)
    dissimilarity = collections.defaultdict(dict)
    for alpha, beta in itertools.combinations_with_replacement(classes, 2):
        dissimilarity[alpha][beta] = _pair_dissimilarity(distribution,
                                                         N_class,
                                                         alpha,
                                                         beta)

    # Symmetrize the output.
    # Iterate over key snapshots: `dict.iterkeys` is Python 2-only, and we
    # insert new entries into the inner dictionaries while walking them.
    for c0 in list(dissimilarity):
        for c1 in list(dissimilarity[c0]):
            if c0 not in dissimilarity[c1]:
                dissimilarity[c1][c0] = dissimilarity[c0][c1]

    return dissimilarity
def representation(distribution, classes=None):
    """ Compute the representation of the different classes in all areal units

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    classes: dictionary of lists
        When the original categories need to be aggregated into different
        classes.
        > {class: [categories belonging to this class]}
        This can be arbitrarily imposed, or computed with uncover_classes
        function of this package.

    Returns
    -------

    representation: nested dictionaries
        Representation of each category in each areal unit.
        > {areal_id: {class_id: (representation_values,
                                 variance of the null model)}}
    """
    # Regroup into classes if specified. Otherwise return categories indicated
    # in the data
    if classes:
        distribution = regroup_per_class(distribution, classes)
    else:
        classes = return_categories(distribution)

    # Compute the total numbers per class and per individual
    N_unit, N_class, N_tot = compute_totals(distribution, classes)

    # Compute the representation and standard deviation for all areal units.
    # `dict.items` works on both Python 2 and 3 (`iteritems` is Py2-only).
    representation = {au: {cl: (single_representation(dist_au[cl],
                                                      N_unit[au],
                                                      N_class[cl],
                                                      N_tot),
                                single_variance(N_unit[au],
                                                N_class[cl],
                                                N_tot))
                           for cl in classes}
                      for au, dist_au in distribution.items()}

    return representation
def test_compute_totals(self):
    """ Compute totals """
    # Build a known synthetic city and derive its categories
    city = fake_city()
    categories = return_categories(city)
    totals_unit, totals_class, grand_total = compute_totals(city, categories)

    # Expected values, computed by hand
    expected_total = 226
    expected_class = {1: 0, 3: 22, 4: 37, 5: 66, 6: 9, 7: 76, 8: 16}
    expected_unit = {"A": 131, "B": 95}

    # Compare each total against its hand-computed answer
    assert grand_total == expected_total
    assert_equal(totals_class, expected_class)
    assert_equal(totals_unit, expected_unit)
def test_compute_totals(self):
    """ Compute totals """
    # Build a synthetic city fixture and extract its categories
    city = fake_city()
    cat = return_categories(city)
    # compute_totals returns (per-unit totals, per-class totals, grand total)
    N_au, N_class, N_tot = compute_totals(city, cat)

    # Answers computed by hand
    Ntot_answer = 226
    Nclass_answer = {1: 0, 3: 22, 4: 37, 5: 66, 6: 9, 7: 76, 8: 16}
    Nau_answer = {"A": 131, "B": 95}

    # Test
    assert N_tot == Ntot_answer
    assert_equal(N_class, Nclass_answer)
    assert_equal(N_au, Nau_answer)
def representation(distribution, classes=None):
    """ Compute the representation of the different classes in all areal units

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    classes: dictionary of lists
        When the original categories need to be aggregated into different
        classes.
        > {class: [categories belonging to this class]}
        This can be arbitrarily imposed, or computed with uncover_classes
        function of this package.

    Returns
    -------

    representation: nested dictionaries
        Representation of each category in each areal unit.
        > {areal_id: {class_id: (representation_values,
                                 variance of the null model)}}
    """
    # Regroup into classes if specified. Otherwise return categories indicated
    # in the data
    if classes:
        distribution = regroup_per_class(distribution, classes)
    else:
        classes = return_categories(distribution)

    # Compute the total numbers per class and per individual
    N_unit, N_class, N_tot = compute_totals(distribution, classes)

    # Compute the representation and standard deviation for all areal units.
    # `dict.items` works on both Python 2 and 3 (`iteritems` is Py2-only).
    representation = {
        au: {
            cl: (single_representation(dist_au[cl], N_unit[au],
                                       N_class[cl], N_tot),
                 single_variance(N_unit[au], N_class[cl], N_tot))
            for cl in classes
        }
        for au, dist_au in distribution.items()
    }

    return representation
def dissimilarity(distribution, classes=None):
    r""" Compute the inter-class dissimilarity index

    The dissimilarity index between two categories `\alpha` and `\beta` is
    defined as

    ..math::
        D_{\alpha \beta} = \frac{1}{2} \sum_{i=1}^{T} \left|
        \frac{n_\alpha(t)}{N_\alpha} - \frac{n_\beta(t)}{N_\beta} \right|

    Its value ranges from 0 to 1.

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    classes: dictionary of lists
        When the original categories need to be aggregated into different
        classes.
        > {class: [categories belonging to this class]}
        This can be arbitrarily imposed, or computed with uncover_classes
        function of this package.

    Returns
    -------

    dissimilarity: nested dictionary
        Classes matrix with dissimilarity as values
        > {alpha: {beta: D_{\alpha \beta}}}
    """
    ## Regroup into classes if specified
    if classes is not None:
        distribution = regroup_per_class(distribution, classes)
    else:
        classes = return_categories(distribution)

    ## Compute total numbers of individuals per class and areal unit
    N_unit, N_class, N_tot = compute_totals(distribution, classes)

    ## Compute the dissimilarity matrix
    # Only half of the values are computed (the matrix is symmetric)
    dissimilarity = collections.defaultdict(dict)
    for alpha, beta in itertools.combinations_with_replacement(classes, 2):
        dissimilarity[alpha][beta] = _pair_dissimilarity(distribution,
                                                         N_class,
                                                         alpha, beta)

    # Symmetrize the output.
    # Iterate over key snapshots: `dict.iterkeys` is Python 2-only, and we
    # insert new entries into the inner dictionaries while walking them.
    for c0 in list(dissimilarity):
        for c1 in list(dissimilarity[c0]):
            if c0 not in dissimilarity[c1]:
                dissimilarity[c1][c0] = dissimilarity[c0][c1]

    return dissimilarity
def cluster_categories(distribution, exposure):
    r""" Perform hierarhical clustering on the intra-tract exposure values

    At each step of the aggregation, we look for the pair `(\beta, \delta)` of
    categories that has the highest exposure (renormalised by the maximum
    possible value). We aggregate them in a new category `\gamma` whose
    exposure with the other categories `\alpha` is given by

    .. math::
        E_{\alpha, \gamma} = \frac{1}{N_\beta + N_\delta} \left( N_\beta
        E_{\alpha, \beta} + N_\delta E_{\alpha, \delta} \right)

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    exposure: nested dictionaries
        Matrix of exposures between categories.
        > {class_id0: {class_id1: (exposure_01, variance null model)}}

    Returns
    -------

    linkage: list of tuples
        list L that encodes the hierarhical tree. At the ith iteration of the
        algorithm, L[i,0] and L[i,1] are aggregated to form the n+ith cluster.
        The exposure between L[i,1] and L[i,0] is given by L[i,3], the variance
        is given by L[i,4].
    """
    #
    # Data preparation
    #

    ## Linkage matrix: start from the individual categories, sorted numerically
    linkage = sorted(exposure, key=lambda x: int(x))
    N = len(linkage)

    ## Get totals
    categories = return_categories(distribution)
    N_unit, N_class, N_tot = compute_totals(distribution, categories)

    ## Use classes' position in the linkage matrix rather than names.
    # Build the name -> position map once; calling linkage.index(cl) in every
    # loop/comprehension below would make the re-keying O(n^2).
    position = {cl: i for i, cl in enumerate(linkage)}

    # Class totals
    for cl in categories:
        N_class[position[cl]] = N_class.pop(cl)

    # Exposure values and variances, re-keyed by position
    E = {position[cl0]: {position[cl1]: exposure[cl0][cl1][0]
                         for cl1 in exposure[cl0]}
         for cl0 in exposure}
    E_var = {position[cl0]: {position[cl1]: exposure[cl0][cl1][1]
                             for cl1 in exposure[cl0]}
             for cl0 in exposure}

    #
    # Clustering
    #
    # N categories require N-1 aggregations to form a full tree
    for _ in range(N - 1):
        a, b = _find_friends(E, N_class)
        linkage.append((a, b, E[a][b], E_var[a][b]))
        E, E_var, N_class = _update_matrix(E, E_var, N_class, a, b)

    return linkage
def exposure(distribution, classes=None):
    r""" Compute the exposure between classes

    The exposure between two categories `\alpha` and `\beta` is defined as

    ..math::
        E_{\alpha \beta} = \frac{1}{N} \sum_{t=1}^{T} n(t) r_\alpha(t)
        r_\beta(t)

    where `r_\alpha(t)` is the representation of the class `\alpha` in the
    areal unit `t`, `n(t)` the total population of `t`, and `N` the total
    population in the considered system.

    The exposure of a class to itself `E_{\alpha \alpha}` measures the
    **isolation** of this class.

    The variance is computed on the null model which corresponds to the
    unsegregated configuration, that is when the spatial repartition of people
    of different income classes is no different from that that would be
    obtained if they scattered at random across the city.

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    classes: dictionary of lists
        When the original categories need to be aggregated into different
        classes.
        > {class: [categories belonging to this class]}
        This can be arbitrarily imposed, or computed with uncover_classes
        function of this package.

    Returns
    -------

    exposure: nested dictionaries
        Matrix of exposures between categories.
        > {class_id0: {class_id1: (exposure_01, variance null model)}}
    """
    ## Regroup into classes if specified.
    if classes:
        distribution = regroup_per_class(distribution, classes)
    else:
        classes = return_categories(distribution)

    ## Compute the total numbers per class and per areal unit
    N_unit, N_class, N_tot = compute_totals(distribution, classes)

    ## Compute representation for all areal units
    representation = mb.representation(distribution)

    ## Compute the exposure matrix
    # Only half of the values are computed (the matrix is symmetric)
    exposure = collections.defaultdict(dict)
    for alpha, beta in itertools.combinations_with_replacement(classes, 2):
        exposure[alpha][beta] = (pair_exposure(representation, N_unit, N_tot,
                                               alpha, beta),
                                 pair_variance(representation, N_unit, N_class,
                                               N_tot, alpha, beta))

    # Symmetrize the output.
    # Iterate over key snapshots: `dict.iterkeys` is Python 2-only, and we
    # insert new entries into the inner dictionaries while walking them.
    for c0 in list(exposure):
        for c1 in list(exposure[c0]):
            if c0 not in exposure[c1]:
                exposure[c1][c0] = exposure[c0][c1]

    return exposure
def cluster_categories(distribution, exposure):
    r""" Perform hierarhical clustering on the intra-tract exposure values

    At each step of the aggregation, we look for the pair `(\beta, \delta)` of
    categories that has the highest exposure (renormalised by the maximum
    possible value). We aggregate them in a new category `\gamma` whose
    exposure with the other categories `\alpha` is given by

    .. math::
        E_{\alpha, \gamma} = \frac{1}{N_\beta + N_\delta} \left( N_\beta
        E_{\alpha, \beta} + N_\delta E_{\alpha, \delta} \right)

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    exposure: nested dictionaries
        Matrix of exposures between categories.
        > {class_id0: {class_id1: (exposure_01, variance null model)}}

    Returns
    -------

    linkage: list of tuples
        list L that encodes the hierarhical tree. At the ith iteration of the
        algorithm, L[i,0] and L[i,1] are aggregated to form the n+ith cluster.
        The exposure between L[i,1] and L[i,0] is given by L[i,3], the variance
        is given by L[i,4].
    """
    #
    # Data preparation
    #

    ## Linkage matrix: start from the individual categories, sorted numerically
    linkage = sorted(exposure, key=lambda x: int(x))
    N = len(linkage)

    ## Get totals
    categories = return_categories(distribution)
    N_unit, N_class, N_tot = compute_totals(distribution, categories)

    ## Use classes' position in the linkage matrix rather than names.
    # Build the name -> position map once; calling linkage.index(cl) in every
    # loop/comprehension below would make the re-keying O(n^2).
    position = {cl: i for i, cl in enumerate(linkage)}

    # Class totals
    for cl in categories:
        N_class[position[cl]] = N_class.pop(cl)

    # Exposure values and variances, re-keyed by position
    E = {position[cl0]: {position[cl1]: exposure[cl0][cl1][0]
                         for cl1 in exposure[cl0]}
         for cl0 in exposure}
    E_var = {position[cl0]: {position[cl1]: exposure[cl0][cl1][1]
                             for cl1 in exposure[cl0]}
             for cl0 in exposure}

    #
    # Clustering
    #
    # N categories require N-1 aggregations to form a full tree
    for _ in range(N - 1):
        a, b = _find_friends(E, N_class)
        linkage.append((a, b, E[a][b], E_var[a][b]))
        E, E_var, N_class = _update_matrix(E, E_var, N_class, a, b)

    return linkage