Example #1
# imports assumed for a standalone run (module paths follow cityseer's layout)
import numpy as np
import pytest
from scipy.stats import entropy

from cityseer.algos import diversity
from cityseer.tools import mock


def test_hill_diversity():
    # test hill diversity against scipy entropy
    for counts, probs in mock.mock_species_data():
        # check hill q=1 - this can be tested against scipy because hill q=1 is exponential of entropy
        assert np.allclose(diversity.hill_diversity(counts, q=1), np.exp(entropy(probs)), atol=0.001, rtol=0)
        # check that hill q<1 and q>1 is reasonably close to scipy entropy
        # (different internal computation)
        assert np.allclose(diversity.hill_diversity(counts, 0.99999999), np.exp(entropy(probs)), atol=0.001, rtol=0)
        assert np.allclose(diversity.hill_diversity(counts, 1.00000001), np.exp(entropy(probs)), atol=0.001, rtol=0)
        # check for malformed q
        with pytest.raises(ValueError):
            diversity.hill_diversity(counts, q=-1)
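
For reference, the identity exercised above is the Hill number definition: for q != 1, D = (sum_i p_i^q)^(1/(1 - q)), and the q -> 1 limit is the exponential of Shannon entropy. A minimal standalone sketch of that identity (not cityseer's implementation):

import numpy as np

def hill_number(counts, q):
    # q != 1: D = (sum p_i^q) ** (1 / (1 - q)); the q -> 1 limit is exp(entropy)
    probs = counts / counts.sum()
    probs = probs[probs > 0]
    if np.isclose(q, 1.0):
        return np.exp(-np.sum(probs * np.log(probs)))
    return np.sum(probs ** q) ** (1.0 / (1.0 - q))

counts = np.array([5, 3, 2])
probs = counts / counts.sum()
assert np.isclose(hill_number(counts, 1.0), np.exp(-(probs * np.log(probs)).sum()))
assert np.isclose(hill_number(counts, 0), 3.0)  # q = 0 reduces to species richness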
Example #2
# imports as in Example #1 (numpy, pytest, diversity, mock)
def test_hill_diversity_branch_distance_wt():
    # test against hill diversity by setting all weights = 1
    for counts, probs in mock.mock_species_data():

        non_weights = np.full(len(counts), 1)  # unit weights
        non_beta = -0  # zero beta: exp(beta * d) = 1, i.e. no distance decay
        for q in [0, 1, 2]:
            assert np.allclose(diversity.hill_diversity(counts, q),
                               diversity.hill_diversity_branch_distance_wt(counts, non_weights, q, non_beta),
                               atol=0.001, rtol=0)

        # check for malformed signatures
        with pytest.raises(ValueError):
            diversity.hill_diversity_branch_distance_wt(counts[:-1], non_weights, q=1, beta=-0.005)
        with pytest.raises(ValueError):
            diversity.hill_diversity_branch_distance_wt(counts, non_weights[:-1], q=1, beta=-0.005)
        with pytest.raises(ValueError):
            diversity.hill_diversity_branch_distance_wt(counts, non_weights, q=1, beta=0.005)
        with pytest.raises(ValueError):
            diversity.hill_diversity_branch_distance_wt(counts, non_weights, q=-1, beta=-0.005)
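
The reduction to plain Hill diversity works because the branch weights take the form exp(beta * d) in this version's (negative beta) convention, so beta = 0 gives a unit weight at every distance. A quick illustration:

import numpy as np

distances = np.array([100.0, 400.0, 800.0])
for beta in (-0.005, 0.0):
    print(beta, np.exp(beta * distances))  # beta = 0.0 -> [1. 1. 1.]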
Example #3
    # fragment of a larger loop over sampled locations (surrounding context elided)
    if len(class_code_list) > max_elements:
        continue

    if len(class_code_list) % 100 == 0:
        print(f'List now at {len(class_code_list)}')

    class_code_arr = np.array(class_code_list)
    dist_arr = np.array(class_dist_list)
    classes_unique, classes_counts, classes_nearest = deduce_unique_species(
        class_code_arr, dist_arr, max_dist=1600)

    # iterate the betas and generate the mixed use metrics
    for k, beta in zip(data_keys, data_betas):
        # run the calculations
        data_5[k]['mu_hill_0'].append(
            diversity.hill_diversity(classes_counts, 0))
        data_5[k]['mu_hill_1'].append(
            diversity.hill_diversity(classes_counts, 1))
        data_5[k]['mu_hill_2'].append(
            diversity.hill_diversity(classes_counts, 2))

        data_5[k]['mu_hill_branch_wt_0'].append(
            diversity.hill_diversity_branch_distance_wt(classes_counts,
                                                        classes_nearest,
                                                        0,
                                                        beta=beta))
        data_5[k]['mu_hill_branch_wt_1'].append(
            diversity.hill_diversity_branch_distance_wt(classes_counts,
                                                        classes_nearest,
                                                        1,
                                                        beta=beta))
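
The betas iterated above pair with distance thresholds through an exponential decay curve. Assuming cityseer's default minimum weight threshold of w_min = exp(-4) (approximately 0.01832), the relationship is d_max = -ln(w_min) / beta, which is how a beta of 0.0025 corresponds to the 1600 m cutoff used for deduce_unique_species above. A sketch under that assumption:

import numpy as np

betas = np.array([0.02, 0.01, 0.005, 0.0025])  # positive-beta convention
w_min = np.exp(-4.0)  # assumed default minimum weight threshold
print(-np.log(w_min) / betas)  # [ 200.  400.  800. 1600.]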
Example #4
# assumed context: in the source package this function is numba-compiled, with
# supporting imports (numpy as np, typing's Dict and Tuple, plus checks,
# diversity and aggregate_to_src_idx from the same package)
def local_aggregator(
    node_data: np.ndarray,
    edge_data: np.ndarray,
    node_edge_map: Dict,
    data_map: np.ndarray,
    distances: np.ndarray,
    betas: np.ndarray,
    landuse_encodings: np.ndarray = np.array([]),
    qs: np.ndarray = np.array([]),
    mixed_use_hill_keys: np.ndarray = np.array([]),
    mixed_use_other_keys: np.ndarray = np.array([]),
    accessibility_keys: np.ndarray = np.array([]),
    cl_disparity_wt_matrix: np.ndarray = np.array(np.full((0, 0), np.nan)),
    numerical_arrays: np.ndarray = np.array(np.full((0, 0), np.nan)),
    angular: bool = False,
    suppress_progress: bool = False
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray,
           np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray,
           np.ndarray, np.ndarray]:
    '''
    NODE MAP:
    0 - x
    1 - y
    2 - live
    3 - ghosted
    EDGE MAP:
    0 - start node
    1 - end node
    2 - length in metres
    3 - sum of angular travel along length
    4 - impedance factor
    5 - in bearing
    6 - out bearing
    DATA MAP:
    0 - x
    1 - y
    2 - assigned network index - nearest
    3 - assigned network index - next-nearest
    '''
    checks.check_network_maps(node_data, edge_data, node_edge_map)
    checks.check_data_map(
        data_map, check_assigned=True
    )  # raises ValueError if data points are not assigned to a network
    checks.check_distances_and_betas(distances, betas)

    # check landuse encodings
    compute_landuses = False
    if len(landuse_encodings) == 0:
        if len(mixed_use_hill_keys) != 0 or len(
                mixed_use_other_keys) != 0 or len(accessibility_keys) != 0:
            raise ValueError(
                'Mixed use metrics or land-use accessibilities require an array of landuse labels.'
            )
    elif len(landuse_encodings) != len(data_map):
        raise ValueError(
            'The number of landuse encodings does not match the number of data points.'
        )
    else:
        checks.check_categorical_data(landuse_encodings)

    # catch completely missing metrics
    if len(mixed_use_hill_keys) == 0 and len(
            mixed_use_other_keys) == 0 and len(accessibility_keys) == 0:
        if len(numerical_arrays) == 0:
            raise ValueError(
                'No metrics specified: please specify at least one metric to compute.'
            )
    else:
        compute_landuses = True

    # catch missing qs
    if len(mixed_use_hill_keys) != 0 and len(qs) == 0:
        raise ValueError(
            'Hill diversity measures require that at least one value of q is specified.'
        )

    # negative qs caught by hill diversity methods

    # check various problematic key combinations
    if len(mixed_use_hill_keys) != 0:
        if (mixed_use_hill_keys.min() < 0 or mixed_use_hill_keys.max() > 3):
            raise ValueError('Mixed-use "hill" keys out of range of 0:4.')

    if len(mixed_use_other_keys) != 0:
        if (mixed_use_other_keys.min() < 0 or mixed_use_other_keys.max() > 2):
            raise ValueError('Mixed-use "other" keys out of range of 0:3.')

    if len(accessibility_keys) != 0:
        max_ac_key = landuse_encodings.max()
        if (accessibility_keys.min() < 0
                or accessibility_keys.max() > max_ac_key):
            raise ValueError(
                'Negative or out of range accessibility key encountered. Keys must match class encodings.'
            )

    for i in range(len(mixed_use_hill_keys)):
        for j in range(len(mixed_use_hill_keys)):
            if j > i:
                i_key = mixed_use_hill_keys[i]
                j_key = mixed_use_hill_keys[j]
                if i_key == j_key:
                    raise ValueError('Duplicate mixed-use "hill" key.')

    for i in range(len(mixed_use_other_keys)):
        for j in range(len(mixed_use_other_keys)):
            if j > i:
                i_key = mixed_use_other_keys[i]
                j_key = mixed_use_other_keys[j]
                if i_key == j_key:
                    raise ValueError('Duplicate mixed-use "other" key.')

    for i in range(len(accessibility_keys)):
        for j in range(len(accessibility_keys)):
            if j > i:
                i_key = accessibility_keys[i]
                j_key = accessibility_keys[j]
                if i_key == j_key:
                    raise ValueError('Duplicate accessibility key.')

    def disp_check(disp_matrix):
        # the length of the disparity matrix vis-a-vis unique landuses is tested in underlying diversity functions
        if disp_matrix.ndim != 2 or disp_matrix.shape[0] != disp_matrix.shape[
                1]:
            raise ValueError(
                'The disparity matrix must be a square NxN matrix.')
        if len(disp_matrix) == 0:
            raise ValueError(
                'Hill disparity and Rao pairwise measures require a class disparity weights matrix.'
            )

    # check that missing or malformed disparity weights matrices are caught
    for k in mixed_use_hill_keys:
        if k == 3:  # hill disparity
            disp_check(cl_disparity_wt_matrix)
    for k in mixed_use_other_keys:
        if k == 2:  # raos pairwise
            disp_check(cl_disparity_wt_matrix)

    compute_numerical = False
    # when passing an empty 2d array to numba, use: np.array(np.full((0, 0), np.nan))
    if len(numerical_arrays) != 0:
        compute_numerical = True
        if numerical_arrays.shape[1] != len(data_map):
            raise ValueError(
                'The length of the numerical data arrays does not match the length of the data map.'
            )
        checks.check_numerical_data(numerical_arrays)

    # establish variables
    netw_n = len(node_data)
    d_n = len(distances)
    q_n = len(qs)
    n_n = len(numerical_arrays)
    global_max_dist = distances.max()
    netw_nodes_live = node_data[:, 2]

    # setup data structures
    # hill mixed uses are structured separately to take values of q into account
    mixed_use_hill_data = np.full((4, q_n, d_n, netw_n), np.nan)  # 4 dim
    mixed_use_other_data = np.full((3, d_n, netw_n), np.nan)  # 3 dim
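    # axis order: (hill metric key 0..3, q index, distance index, node index)
    # and (other metric key 0..2, distance index, node index) respectively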

    accessibility_data = np.full((len(accessibility_keys), d_n, netw_n), 0.0)
    accessibility_data_wt = np.full((len(accessibility_keys), d_n, netw_n),
                                    0.0)

    # stats
    stats_sum = np.full((n_n, d_n, netw_n), np.nan)
    stats_sum_wt = np.full((n_n, d_n, netw_n), np.nan)

    stats_mean = np.full((n_n, d_n, netw_n), np.nan)
    stats_mean_wt = np.full((n_n, d_n, netw_n), np.nan)

    stats_count = np.full(
        (n_n, d_n, netw_n),
        np.nan)  # use np.nan instead of 0 to avoid division by zero issues
    stats_count_wt = np.full((n_n, d_n, netw_n), np.nan)

    stats_variance = np.full((n_n, d_n, netw_n), np.nan)
    stats_variance_wt = np.full((n_n, d_n, netw_n), np.nan)

    stats_max = np.full((n_n, d_n, netw_n), np.nan)
    stats_min = np.full((n_n, d_n, netw_n), np.nan)

    # iterate through each vert and aggregate
    steps = int(netw_n / 10000)
    for netw_src_idx in range(netw_n):
        if not suppress_progress:
            checks.progress_bar(netw_src_idx, netw_n, steps)
        # only compute for live nodes
        if not netw_nodes_live[netw_src_idx]:
            continue
        # generate the reachable classes and their respective distances
        # these are non-unique - i.e. simply the class of each data point within the maximum distance
        # the aggregate_to_src_idx method will choose the closer direction of approach to a data point
        # from the nearest or next-nearest network node (calculated once globally, prior to local_landuses method)
        reachable_data, reachable_data_dist, tree_preds = aggregate_to_src_idx(
            netw_src_idx, node_data, edge_data, node_edge_map, data_map,
            global_max_dist, angular)
        # LANDUSES
        if compute_landuses:
            mu_max_unique_cl = int(landuse_encodings.max() + 1)
            # counts of each class type (array length per max unique classes - not just those within max distance)
            classes_counts = np.full((d_n, mu_max_unique_cl), 0)
            # nearest of each class type (likewise)
            classes_nearest = np.full((d_n, mu_max_unique_cl), np.inf)
            # iterate the reachable indices and related distances
            for data_idx, (reachable, data_dist) in enumerate(
                    zip(reachable_data, reachable_data_dist)):
                if not reachable:
                    continue
                # get the class category in integer form
                # all class codes were encoded to sequential integers - these correspond to the array indices
                cl_code = int(landuse_encodings[int(data_idx)])
                # iterate the distance dimensions
                for d_idx, (d, b) in enumerate(zip(distances, betas)):
                    # increment class counts at respective distances if the distance is less than current d
                    if data_dist <= d:
                        classes_counts[d_idx, cl_code] += 1
                        # if distance is nearer, update the nearest distance array too
                        if data_dist < classes_nearest[d_idx, cl_code]:
                            classes_nearest[d_idx, cl_code] = data_dist
                        # if within distance, and if in accessibility keys, then aggregate accessibility too
                        for ac_idx, ac_code in enumerate(accessibility_keys):
                            if ac_code == cl_code:
                                accessibility_data[ac_idx, d_idx, netw_src_idx] += 1
                                accessibility_data_wt[ac_idx, d_idx, netw_src_idx] += np.exp(b * data_dist)
                                # if a match was found, then no need to check others
                                break
            # mixed uses can be calculated now that the local class counts are aggregated
            # iterate the distances and betas
            for d_idx, b in enumerate(betas):
                cl_counts = classes_counts[d_idx]
                cl_nearest = classes_nearest[d_idx]
                # mu keys determine which metrics to compute
                # don't confuse with indices
                # previously used dynamic indices in data structures - but obtuse if irregularly ordered keys
                for mu_hill_key in mixed_use_hill_keys:
                    for q_idx, q_key in enumerate(qs):
                        if mu_hill_key == 0:
                            mixed_use_hill_data[0, q_idx, d_idx, netw_src_idx] = \
                                diversity.hill_diversity(cl_counts, q_key)
                        elif mu_hill_key == 1:
                            mixed_use_hill_data[1, q_idx, d_idx, netw_src_idx] = \
                                diversity.hill_diversity_branch_distance_wt(cl_counts, cl_nearest, q=q_key, beta=b)
                        elif mu_hill_key == 2:
                            mixed_use_hill_data[2, q_idx, d_idx, netw_src_idx] = \
                                diversity.hill_diversity_pairwise_distance_wt(cl_counts, cl_nearest, q=q_key, beta=b)
                        # land-use classification disparity hill diversity
                        # the wt matrix can be used without mapping because cl_counts is based on all classes
                        # regardless of whether they are reachable
                        elif mu_hill_key == 3:
                            mixed_use_hill_data[3, q_idx, d_idx, netw_src_idx] = \
                                diversity.hill_diversity_pairwise_matrix_wt(cl_counts,
                                                                            wt_matrix=cl_disparity_wt_matrix,
                                                                            q=q_key)
                for mu_other_key in mixed_use_other_keys:
                    if mu_other_key == 0:
                        mixed_use_other_data[0, d_idx, netw_src_idx] = \
                            diversity.shannon_diversity(cl_counts)
                    elif mu_other_key == 1:
                        mixed_use_other_data[1, d_idx, netw_src_idx] = \
                            diversity.gini_simpson_diversity(cl_counts)
                    elif mu_other_key == 2:
                        mixed_use_other_data[2, d_idx, netw_src_idx] = \
                            diversity.raos_quadratic_diversity(cl_counts, wt_matrix=cl_disparity_wt_matrix)
        # IDW
        # the order of the loops matters because the nested aggregations happen per distance per numerical array
        if compute_numerical:
            # iterate the reachable indices and related distances
            for data_idx, (reachable, data_dist) in enumerate(
                    zip(reachable_data, reachable_data_dist)):
                # some indices will be NaN if beyond max threshold distance - so check for infinity
                # this happens when within radial max distance, but beyond network max distance
                if not reachable:
                    continue
                # iterate the numerical arrays dimension
                for num_idx in range(n_n):
                    # some values will be NaN
                    num = numerical_arrays[num_idx, int(data_idx)]
                    if np.isnan(num):
                        continue
                    # iterate the distance dimensions
                    for d_idx, (d, b) in enumerate(zip(distances, betas)):
                        # increment mean aggregations at respective distances if the distance is less than current d
                        if data_dist <= d:
                            wt = np.exp(data_dist * b)
                            # aggregate
                            if np.isnan(stats_sum[num_idx, d_idx, netw_src_idx]):
                                stats_sum[num_idx, d_idx, netw_src_idx] = num
                                stats_count[num_idx, d_idx, netw_src_idx] = 1
                                stats_sum_wt[num_idx, d_idx, netw_src_idx] = num * wt
                                stats_count_wt[num_idx, d_idx, netw_src_idx] = wt
                            else:
                                stats_sum[num_idx, d_idx, netw_src_idx] += num
                                stats_count[num_idx, d_idx, netw_src_idx] += 1
                                stats_sum_wt[num_idx, d_idx, netw_src_idx] += num * wt
                                stats_count_wt[num_idx, d_idx, netw_src_idx] += wt

                            if np.isnan(stats_max[num_idx, d_idx,
                                                  netw_src_idx]):
                                stats_max[num_idx, d_idx, netw_src_idx] = num
                            elif num > stats_max[num_idx, d_idx, netw_src_idx]:
                                stats_max[num_idx, d_idx, netw_src_idx] = num

                            if np.isnan(stats_min[num_idx, d_idx,
                                                  netw_src_idx]):
                                stats_min[num_idx, d_idx, netw_src_idx] = num
                            elif num < stats_min[num_idx, d_idx, netw_src_idx]:
                                stats_min[num_idx, d_idx, netw_src_idx] = num
            # finalise mean calculations - this is happening for a single netw_src_idx, so fairly fast
            for num_idx in range(n_n):
                for d_idx in range(d_n):
                    stats_mean[num_idx, d_idx, netw_src_idx] = \
                        stats_sum[num_idx, d_idx, netw_src_idx] / stats_count[num_idx, d_idx, netw_src_idx]
                    stats_mean_wt[num_idx, d_idx, netw_src_idx] = \
                        stats_sum_wt[num_idx, d_idx, netw_src_idx] / stats_count_wt[num_idx, d_idx, netw_src_idx]
            # calculate variances - counts are already computed per above
            # weighted version is IDW by division through equivalently weighted counts above
            # iterate the reachable indices and related distances
            for data_idx, (reachable, data_dist) in enumerate(
                    zip(reachable_data, reachable_data_dist)):
                # some indices will be NaN if beyond max threshold distance - so check for infinity
                # this happens when within radial max distance, but beyond network max distance
                if not reachable:
                    continue
                # iterate the numerical arrays dimension
                for num_idx in range(n_n):
                    # some values will be NaN
                    num = numerical_arrays[num_idx, int(data_idx)]
                    if np.isnan(num):
                        continue
                    # iterate the distance dimensions
                    for d_idx, (d, b) in enumerate(zip(distances, betas)):
                        # increment variance aggregations at respective distances if the distance is less than current d
                        if data_dist <= d:
                            # aggregate
                            if np.isnan(stats_variance[num_idx, d_idx,
                                                       netw_src_idx]):
                                stats_variance[num_idx, d_idx, netw_src_idx] = \
                                    np.square(num - stats_mean[num_idx, d_idx, netw_src_idx])
                                stats_variance_wt[num_idx, d_idx, netw_src_idx] = \
                                    np.square(num - stats_mean_wt[num_idx, d_idx, netw_src_idx]) * np.exp(data_dist * b)
                            else:
                                stats_variance[num_idx, d_idx, netw_src_idx] += \
                                    np.square(num - stats_mean[num_idx, d_idx, netw_src_idx])
                                stats_variance_wt[num_idx, d_idx, netw_src_idx] += \
                                    np.square(num - stats_mean_wt[num_idx, d_idx, netw_src_idx]) * np.exp(data_dist * b)
            # finalise variance calculations
            for num_idx in range(n_n):
                for d_idx in range(d_n):
                    stats_variance[num_idx, d_idx, netw_src_idx] = \
                        stats_variance[num_idx, d_idx, netw_src_idx] / stats_count[num_idx, d_idx, netw_src_idx]
                    stats_variance_wt[num_idx, d_idx, netw_src_idx] = \
                        stats_variance_wt[num_idx, d_idx, netw_src_idx] / stats_count_wt[num_idx, d_idx, netw_src_idx]
    # send the data back in the same types and same order as the original keys - convert to int for indexing
    mu_hill_k_int = np.full(len(mixed_use_hill_keys), 0)
    for i, k in enumerate(mixed_use_hill_keys):
        mu_hill_k_int[i] = k
    mu_other_k_int = np.full(len(mixed_use_other_keys), 0)
    for i, k in enumerate(mixed_use_other_keys):
        mu_other_k_int[i] = k

    return mixed_use_hill_data[mu_hill_k_int], \
           mixed_use_other_data[mu_other_k_int], \
           accessibility_data, accessibility_data_wt, \
           stats_sum, stats_sum_wt, \
           stats_mean, stats_mean_wt, \
           stats_variance, stats_variance_wt, \
           stats_max, stats_min
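
The weighted stats in local_aggregator implement a distance-decayed (IDW-style) mean: each value contributes num * exp(beta * d) to the weighted sum and exp(beta * d) to the weighted count, and the finalisation step divides one by the other. A standalone sketch with illustrative values:

import numpy as np

beta = -0.005  # negative-beta convention, as in local_aggregator
values = np.array([10.0, 20.0, 30.0])
dists = np.array([50.0, 200.0, 600.0])
weights = np.exp(beta * dists)
print((values * weights).sum() / weights.sum())  # distance-weighted mean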
Example #5
# imports assumed for a standalone run (module paths follow cityseer's layout)
import numpy as np

from cityseer.algos import data, diversity
from cityseer.metrics import layers, networks
from cityseer.tools import graphs, mock


def test_aggregate_landuses_categorical_components(primal_graph):
    # generate node and edge maps
    node_uids, node_data, edge_data, node_edge_map = graphs.graph_maps_from_nX(primal_graph)
    # setup data
    data_dict = mock.mock_data_dict(primal_graph, random_seed=13)
    data_uids, data_map = layers.data_map_from_dict(data_dict)
    data_map = data.assign_to_network(data_map, node_data, edge_data, node_edge_map, 500)
    # set parameters
    betas = np.array([0.02, 0.01, 0.005, 0.0025])
    distances = networks.distance_from_beta(betas)
    qs = np.array([0, 1, 2])
    mock_categorical = mock.mock_categorical_data(len(data_map))
    landuse_classes, landuse_encodings = layers.encode_categorical(mock_categorical)
    mock_matrix = np.full((len(landuse_classes), len(landuse_classes)), 1)
    # set the keys - add shuffling to be sure various orders work
    hill_keys = np.arange(4)
    np.random.shuffle(hill_keys)
    non_hill_keys = np.arange(3)
    np.random.shuffle(non_hill_keys)
    ac_keys = np.array([1, 2, 5])
    np.random.shuffle(ac_keys)
    # generate
    mu_data_hill, mu_data_other, ac_data, ac_data_wt = data.aggregate_landuses(node_data,
                                                                               edge_data,
                                                                               node_edge_map,
                                                                               data_map,
                                                                               distances,
                                                                               betas,
                                                                               landuse_encodings=landuse_encodings,
                                                                               qs=qs,
                                                                               mixed_use_hill_keys=hill_keys,
                                                                               mixed_use_other_keys=non_hill_keys,
                                                                               accessibility_keys=ac_keys,
                                                                               cl_disparity_wt_matrix=mock_matrix,
                                                                               angular=False)
    # hill
    hill = mu_data_hill[np.where(hill_keys == 0)][0]
    hill_branch_wt = mu_data_hill[np.where(hill_keys == 1)][0]
    hill_pw_wt = mu_data_hill[np.where(hill_keys == 2)][0]
    hill_disp_wt = mu_data_hill[np.where(hill_keys == 3)][0]
    # non hill
    shannon = mu_data_other[np.where(non_hill_keys == 0)][0]
    gini = mu_data_other[np.where(non_hill_keys == 1)][0]
    raos = mu_data_other[np.where(non_hill_keys == 2)][0]
    # access non-weighted
    ac_1_nw = ac_data[np.where(ac_keys == 1)][0]
    ac_2_nw = ac_data[np.where(ac_keys == 2)][0]
    ac_5_nw = ac_data[np.where(ac_keys == 5)][0]
    # access weighted
    ac_1_w = ac_data_wt[np.where(ac_keys == 1)][0]
    ac_2_w = ac_data_wt[np.where(ac_keys == 2)][0]
    ac_5_w = ac_data_wt[np.where(ac_keys == 5)][0]
    # test manual metrics against all nodes
    mu_max_unique = len(landuse_classes)
    # test against various distances
    for d_idx in range(len(distances)):
        dist_cutoff = distances[d_idx]
        beta = betas[d_idx]
        for src_idx in range(len(primal_graph)):
            reachable_data, reachable_data_dist, tree_preds = data.aggregate_to_src_idx(src_idx,
                                                                                        node_data,
                                                                                        edge_data,
                                                                                        node_edge_map,
                                                                                        data_map,
                                                                                        dist_cutoff)
            # counts of each class type (array length per max unique classes - not just those within max distance)
            cl_counts = np.full(mu_max_unique, 0)
            # nearest of each class type (likewise)
            cl_nearest = np.full(mu_max_unique, np.inf)
            # aggregate
            a_1_nw = 0
            a_2_nw = 0
            a_5_nw = 0
            a_1_w = 0
            a_2_w = 0
            a_5_w = 0
            # iterate reachable
            for data_idx, (reachable, data_dist) in enumerate(zip(reachable_data, reachable_data_dist)):
                if not reachable:
                    continue
                cl = landuse_encodings[data_idx]
                # double check distance is within threshold
                assert data_dist <= dist_cutoff
                # update the class counts
                cl_counts[cl] += 1
                # if distance is nearer, update the nearest distance array too
                if data_dist < cl_nearest[cl]:
                    cl_nearest[cl] = data_dist
                # aggregate accessibility codes
                if cl == 1:
                    a_1_nw += 1
                    a_1_w += np.exp(-beta * data_dist)
                elif cl == 2:
                    a_2_nw += 1
                    a_2_w += np.exp(-beta * data_dist)
                elif cl == 5:
                    a_5_nw += 1
                    a_5_w += np.exp(-beta * data_dist)
            # assertions
            assert ac_1_nw[d_idx, src_idx] == a_1_nw
            assert ac_2_nw[d_idx, src_idx] == a_2_nw
            assert ac_5_nw[d_idx, src_idx] == a_5_nw

            assert ac_1_w[d_idx, src_idx] == a_1_w
            assert ac_2_w[d_idx, src_idx] == a_2_w
            assert ac_5_w[d_idx, src_idx] == a_5_w

            assert hill[0, d_idx, src_idx] == diversity.hill_diversity(cl_counts, 0)
            assert hill[1, d_idx, src_idx] == diversity.hill_diversity(cl_counts, 1)
            assert hill[2, d_idx, src_idx] == diversity.hill_diversity(cl_counts, 2)

            assert hill_branch_wt[0, d_idx, src_idx] == \
                   diversity.hill_diversity_branch_distance_wt(cl_counts, cl_nearest, 0, beta)
            assert hill_branch_wt[1, d_idx, src_idx] == \
                   diversity.hill_diversity_branch_distance_wt(cl_counts, cl_nearest, 1, beta)
            assert hill_branch_wt[2, d_idx, src_idx] == \
                   diversity.hill_diversity_branch_distance_wt(cl_counts, cl_nearest, 2, beta)

            assert hill_pw_wt[0, d_idx, src_idx] == \
                   diversity.hill_diversity_pairwise_distance_wt(cl_counts, cl_nearest, 0, beta)
            assert hill_pw_wt[1, d_idx, src_idx] == \
                   diversity.hill_diversity_pairwise_distance_wt(cl_counts, cl_nearest, 1, beta)
            assert hill_pw_wt[2, d_idx, src_idx] == \
                   diversity.hill_diversity_pairwise_distance_wt(cl_counts, cl_nearest, 2, beta)

            assert hill_disp_wt[0, d_idx, src_idx] == \
                   diversity.hill_diversity_pairwise_matrix_wt(cl_counts, mock_matrix, 0)
            assert hill_disp_wt[1, d_idx, src_idx] == \
                   diversity.hill_diversity_pairwise_matrix_wt(cl_counts, mock_matrix, 1)
            assert hill_disp_wt[2, d_idx, src_idx] == \
                   diversity.hill_diversity_pairwise_matrix_wt(cl_counts, mock_matrix, 2)

            assert shannon[d_idx, src_idx] == diversity.shannon_diversity(cl_counts)
            assert gini[d_idx, src_idx] == diversity.gini_simpson_diversity(cl_counts)
            assert raos[d_idx, src_idx] == diversity.raos_quadratic_diversity(cl_counts, mock_matrix)

    # check that angular is passed-through
    # actual angular tests happen in test_shortest_path_tree()
    # here the emphasis is simply on checking that the angular instruction gets chained through

    # setup dual data
    G_dual = graphs.nX_to_dual(primal_graph)
    node_labels_dual, node_data_dual, edge_data_dual, node_edge_map_dual = graphs.graph_maps_from_nX(G_dual)
    data_dict_dual = mock.mock_data_dict(G_dual, random_seed=13)
    data_uids_dual, data_map_dual = layers.data_map_from_dict(data_dict_dual)
    data_map_dual = data.assign_to_network(data_map_dual, node_data_dual, edge_data_dual, node_edge_map_dual, 500)
    mock_categorical = mock.mock_categorical_data(len(data_map_dual))
    landuse_classes_dual, landuse_encodings_dual = layers.encode_categorical(mock_categorical)
    mock_matrix = np.full((len(landuse_classes_dual), len(landuse_classes_dual)), 1)

    mu_hill_dual, mu_other_dual, ac_dual, ac_wt_dual = data.aggregate_landuses(node_data_dual,
                                                                               edge_data_dual,
                                                                               node_edge_map_dual,
                                                                               data_map_dual,
                                                                               distances,
                                                                               betas,
                                                                               landuse_encodings_dual,
                                                                               qs=qs,
                                                                               mixed_use_hill_keys=hill_keys,
                                                                               mixed_use_other_keys=non_hill_keys,
                                                                               accessibility_keys=ac_keys,
                                                                               cl_disparity_wt_matrix=mock_matrix,
                                                                               angular=True)

    mu_hill_dual_sidestep, mu_other_dual_sidestep, ac_dual_sidestep, ac_wt_dual_sidestep = \
        data.aggregate_landuses(node_data_dual,
                                edge_data_dual,
                                node_edge_map_dual,
                                data_map_dual,
                                distances,
                                betas,
                                landuse_encodings_dual,
                                qs=qs,
                                mixed_use_hill_keys=hill_keys,
                                mixed_use_other_keys=non_hill_keys,
                                accessibility_keys=ac_keys,
                                cl_disparity_wt_matrix=mock_matrix,
                                angular=False)

    assert not np.allclose(mu_hill_dual, mu_hill_dual_sidestep, atol=0.001, rtol=0)
    assert not np.allclose(mu_other_dual, mu_other_dual_sidestep, atol=0.001, rtol=0)
    assert not np.allclose(ac_dual, ac_dual_sidestep, atol=0.001, rtol=0)
    assert not np.allclose(ac_wt_dual, ac_wt_dual_sidestep, atol=0.001, rtol=0)
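
The landuse encodings consumed by aggregate_landuses are sequential integer codes, which is what lets them double as indices into the class counts arrays. A conceptual sketch of such an encoding (not the actual layers.encode_categorical implementation):

import numpy as np

labels = np.array(['a', 'c', 'a', 'b', 'c'])
classes = np.unique(labels)                   # ['a' 'b' 'c']
encodings = np.searchsorted(classes, labels)  # [0 2 0 1 2]
print(classes, encodings)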
Example #6
# assumed context: compiled with numba (the prange loop below implies
# @njit(parallel=True)), with supporting imports such as numpy, numba's prange,
# and checks / diversity / aggregate_to_src_idx from the same package
def aggregate_landuses(
    node_data: np.ndarray,
    edge_data: np.ndarray,
    node_edge_map: Dict,
    data_map: np.ndarray,
    distances: np.ndarray,
    betas: np.ndarray,
    landuse_encodings: np.ndarray = np.array([]),
    qs: np.ndarray = np.array([]),
    mixed_use_hill_keys: np.ndarray = np.array([]),
    mixed_use_other_keys: np.ndarray = np.array([]),
    accessibility_keys: np.ndarray = np.array([]),
    cl_disparity_wt_matrix: np.ndarray = np.array(np.full((0, 0), np.nan)),
    jitter_scale: float = 0.0,
    angular: bool = False,
    progress_proxy=None
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    NODE MAP:
    0 - x
    1 - y
    2 - live
    EDGE MAP:
    0 - start node
    1 - end node
    2 - length in metres
    3 - sum of angular travel along length
    4 - impedance factor
    5 - in bearing
    6 - out bearing
    DATA MAP:
    0 - x
    1 - y
    2 - assigned network index - nearest
    3 - assigned network index - next-nearest
    """
    checks.check_network_maps(node_data, edge_data, node_edge_map)
    checks.check_data_map(
        data_map, check_assigned=True
    )  # raises ValueError if data points are not assigned to a network
    checks.check_distances_and_betas(distances, betas)
    # check landuse encodings
    if len(landuse_encodings) == 0:
        raise ValueError(
            'Mixed use metrics or land-use accessibilities require an array of landuse labels.'
        )
    elif len(landuse_encodings) != len(data_map):
        raise ValueError(
            'The number of landuse encodings does not match the number of data points.'
        )
    else:
        checks.check_categorical_data(landuse_encodings)
    # catch completely missing metrics
    if len(mixed_use_hill_keys) == 0 and len(
            mixed_use_other_keys) == 0 and len(accessibility_keys) == 0:
        raise ValueError(
            'No metrics specified: please specify at least one metric to compute.'
        )
    # catch missing qs
    if len(mixed_use_hill_keys) != 0 and len(qs) == 0:
        raise ValueError(
            'Hill diversity measures require that at least one value of q is specified.'
        )
    # negative qs caught by hill diversity methods
    # check various problematic key combinations
    if len(mixed_use_hill_keys) != 0:
        if np.nanmin(mixed_use_hill_keys) < 0 or np.max(
                mixed_use_hill_keys) > 3:
            raise ValueError('Mixed-use "hill" keys out of range of 0:4.')
    if len(mixed_use_other_keys) != 0:
        if np.nanmin(mixed_use_other_keys) < 0 or np.max(
                mixed_use_other_keys) > 2:
            raise ValueError('Mixed-use "other" keys out of range of 0:3.')
    if len(accessibility_keys) != 0:
        max_ac_key = np.nanmax(landuse_encodings)
        if np.nanmin(accessibility_keys) < 0 or np.max(
                accessibility_keys) > max_ac_key:
            raise ValueError(
                'Negative or out of range accessibility key encountered. Keys must match class encodings.'
            )
    for i in range(len(mixed_use_hill_keys)):
        for j in range(len(mixed_use_hill_keys)):
            if j > i:
                i_key = mixed_use_hill_keys[i]
                j_key = mixed_use_hill_keys[j]
                if i_key == j_key:
                    raise ValueError('Duplicate mixed-use "hill" key.')
    for i in range(len(mixed_use_other_keys)):
        for j in range(len(mixed_use_other_keys)):
            if j > i:
                i_key = mixed_use_other_keys[i]
                j_key = mixed_use_other_keys[j]
                if i_key == j_key:
                    raise ValueError('Duplicate mixed-use "other" key.')
    for i in range(len(accessibility_keys)):
        for j in range(len(accessibility_keys)):
            if j > i:
                i_key = accessibility_keys[i]
                j_key = accessibility_keys[j]
                if i_key == j_key:
                    raise ValueError('Duplicate accessibility key.')

    def disp_check(disp_matrix):
        # the length of the disparity matrix vis-a-vis unique landuses is tested in underlying diversity functions
        if disp_matrix.ndim != 2 or disp_matrix.shape[0] != disp_matrix.shape[
                1]:
            raise ValueError(
                'The disparity matrix must be a square NxN matrix.')
        if len(disp_matrix) == 0:
            raise ValueError(
                'Hill disparity and Rao pairwise measures require a class disparity weights matrix.'
            )

    # check that missing or malformed disparity weights matrices are caught
    for k in mixed_use_hill_keys:
        if k == 3:  # hill disparity
            disp_check(cl_disparity_wt_matrix)
    for k in mixed_use_other_keys:
        if k == 2:  # raos pairwise
            disp_check(cl_disparity_wt_matrix)
    # establish variables
    netw_n = len(node_data)
    d_n = len(distances)
    q_n = len(qs)
    global_max_dist = float(np.nanmax(distances))
    netw_nodes_live = node_data[:, 2]
    # setup data structures
    # hill mixed uses are structured separately to take values of q into account
    mixed_use_hill_data = np.full((4, q_n, d_n, netw_n), 0.0)  # 4 dim
    mixed_use_other_data = np.full((3, d_n, netw_n), 0.0)  # 3 dim
    accessibility_data = np.full((len(accessibility_keys), d_n, netw_n), 0.0)
    accessibility_data_wt = np.full((len(accessibility_keys), d_n, netw_n),
                                    0.0)
    # iterate through each vert and aggregate
    # parallelise over n nodes:
    # each distance or stat array index is therefore only touched by one thread at a time
    # i.e. no need to use inner array deductions as with centralities
    for netw_src_idx in prange(netw_n):
        if progress_proxy is not None:
            progress_proxy.update(1)
        # only compute for live nodes
        if not netw_nodes_live[netw_src_idx]:
            continue
        # generate the reachable classes and their respective distances
        # these are non-unique - i.e. simply the class of each data point within the maximum distance
        # the aggregate_to_src_idx method will choose the closer direction of approach to a data point
        # from the nearest or next-nearest network node (calculated once globally, prior to local_landuses method)
        reachable_data, reachable_data_dist, tree_preds = aggregate_to_src_idx(
            netw_src_idx,
            node_data,
            edge_data,
            node_edge_map,
            data_map,
            global_max_dist,
            jitter_scale=jitter_scale,
            angular=angular)
        # LANDUSES
        mu_max_unique_cl = int(landuse_encodings.max() + 1)
        # counts of each class type (array length per max unique classes - not just those within max distance)
        classes_counts = np.full((d_n, mu_max_unique_cl), 0)
        # nearest of each class type (likewise)
        classes_nearest = np.full((d_n, mu_max_unique_cl), np.inf)
        # iterate the reachable indices and related distances
        for data_idx, (reachable, data_dist) in enumerate(
                zip(reachable_data, reachable_data_dist)):
            if not reachable:
                continue
            # get the class category in integer form
            # all class codes were encoded to sequential integers - these correspond to the array indices
            cl_code = int(landuse_encodings[int(data_idx)])
            # iterate the distance dimensions
            for d_idx, (d, b) in enumerate(zip(distances, betas)):
                # increment class counts at respective distances if the distance is less than current d
                if data_dist <= d:
                    classes_counts[d_idx, cl_code] += 1
                    # if distance is nearer, update the nearest distance array too
                    if data_dist < classes_nearest[d_idx, cl_code]:
                        classes_nearest[d_idx, cl_code] = data_dist
                    # if within distance, and if in accessibility keys, then aggregate accessibility too
                    for ac_idx, ac_code in enumerate(accessibility_keys):
                        if ac_code == cl_code:
                            accessibility_data[ac_idx, d_idx, netw_src_idx] += 1
                            accessibility_data_wt[ac_idx, d_idx, netw_src_idx] += np.exp(-b * data_dist)
                            # if a match was found, then no need to check others
                            break
        # mixed uses can be calculated now that the local class counts are aggregated
        # iterate the distances and betas
        for d_idx, b in enumerate(betas):
            cl_counts = classes_counts[d_idx]
            cl_nearest = classes_nearest[d_idx]
            # mu keys determine which metrics to compute
            # don't confuse with indices
            # previously used dynamic indices in data structures - but obtuse if irregularly ordered keys
            for mu_hill_key in mixed_use_hill_keys:
                for q_idx, q_key in enumerate(qs):
                    if mu_hill_key == 0:
                        mixed_use_hill_data[0, q_idx, d_idx, netw_src_idx] = \
                            diversity.hill_diversity(cl_counts, q_key)
                    elif mu_hill_key == 1:
                        mixed_use_hill_data[1, q_idx, d_idx, netw_src_idx] = \
                            diversity.hill_diversity_branch_distance_wt(cl_counts, cl_nearest, q=q_key, beta=b)
                    elif mu_hill_key == 2:
                        mixed_use_hill_data[2, q_idx, d_idx, netw_src_idx] = \
                            diversity.hill_diversity_pairwise_distance_wt(cl_counts, cl_nearest, q=q_key, beta=b)
                    # land-use classification disparity hill diversity
                    # the wt matrix can be used without mapping because cl_counts is based on all classes
                    # regardless of whether they are reachable
                    elif mu_hill_key == 3:
                        mixed_use_hill_data[3, q_idx, d_idx, netw_src_idx] = \
                            diversity.hill_diversity_pairwise_matrix_wt(cl_counts,
                                                                        wt_matrix=cl_disparity_wt_matrix,
                                                                        q=q_key)
            for mu_other_key in mixed_use_other_keys:
                if mu_other_key == 0:
                    mixed_use_other_data[0, d_idx, netw_src_idx] = \
                        diversity.shannon_diversity(cl_counts)
                elif mu_other_key == 1:
                    mixed_use_other_data[1, d_idx, netw_src_idx] = \
                        diversity.gini_simpson_diversity(cl_counts)
                elif mu_other_key == 2:
                    mixed_use_other_data[2, d_idx, netw_src_idx] = \
                        diversity.raos_quadratic_diversity(cl_counts, wt_matrix=cl_disparity_wt_matrix)
    # send the data back in the same types and same order as the original keys - convert to int for indexing
    mu_hill_k_int = np.full(len(mixed_use_hill_keys), 0)
    for i, k in enumerate(mixed_use_hill_keys):
        mu_hill_k_int[i] = k
    mu_other_k_int = np.full(len(mixed_use_other_keys), 0)
    for i, k in enumerate(mixed_use_other_keys):
        mu_other_k_int[i] = k

    return mixed_use_hill_data[mu_hill_k_int], \
           mixed_use_other_data[mu_other_k_int], \
           accessibility_data, \
           accessibility_data_wt
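
The outer node loop in aggregate_landuses uses prange, which only parallelises when the function is compiled by numba. A minimal sketch of that pattern (the decorator is assumed, as it is not shown in the excerpt above):

import numpy as np
from numba import njit, prange

@njit(parallel=True)
def row_sums(arr):
    out = np.zeros(arr.shape[0])
    for i in prange(arr.shape[0]):  # each row index is touched by one thread only
        out[i] = arr[i, :].sum()
    return out

print(row_sums(np.arange(12.0).reshape(3, 4)))  # [ 6. 22. 38.]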