Beispiel #1
0
def make_angular_hyperplane(data, indices, rng_state):
    left_index = tau_rand_int(rng_state) % indices.shape[0]
    right_index = tau_rand_int(rng_state) % indices.shape[0]
    right_index += left_index == right_index
    right_index = right_index % indices.shape[0]
    left = indices[left_index]
    right = indices[right_index]

    left_norm = norm(data[left])
    right_norm = norm(data[right])

    if left_norm == 0.0:
        left_norm = 1.0

    if right_norm == 0.0:
        right_norm = 1.0

    # Compute the normal vector to the hyperplane (the vector between
    # the two points) and the offset from the origin
    hyperplane_offset = 0.0
    hyperplane_vector = np.empty(data.shape[1], dtype=np.float32)

    for d in range(data.shape[1]):
        hyperplane_vector[d] = (data[left, d] / left_norm) - (data[right, d] /
                                                              right_norm)

    return hyperplane_vector, hyperplane_offset
Beispiel #2
0
def sparse_select_side(hyperplane, offset, point_inds, point_data, rng_state):
    margin = offset

    hyperplane_size = hyperplane.shape[1]
    while hyperplane[0, hyperplane_size - 1] < 0.0:
        hyperplane_size -= 1

    hyperplane_inds = hyperplane[0, :hyperplane_size].astype(np.int32)
    hyperplane_data = hyperplane[1, :hyperplane_size]

    _, aux_data = sparse_mul(hyperplane_inds, hyperplane_data, point_inds,
                             point_data)

    for val in aux_data:
        margin += val

    if abs(margin) < EPS:
        side = tau_rand_int(rng_state) % 2
        if side == 0:
            return 0
        else:
            return 1
    elif margin > 0:
        return 0
    else:
        return 1
Beispiel #3
0
def sparse_current_graph_map_jit(
    heap,
    rows,
    n_neighbors,
    inds,
    indptr,
    data,
    rng_state,
    seed_per_row,
    sparse_dist,
):
    rng_state_local = rng_state.copy()
    for i in rows:
        if seed_per_row:
            seed(rng_state_local, i)
        if heap[0, i, 0] < 0.0:
            for j in range(n_neighbors - np.sum(heap[0, i] >= 0.0)):
                idx = np.abs(tau_rand_int(rng_state_local)) % data.shape[0]

                from_inds = inds[indptr[i]:indptr[i + 1]]
                from_data = data[indptr[i]:indptr[i + 1]]

                to_inds = inds[indptr[idx]:indptr[idx + 1]]
                to_data = data[indptr[idx]:indptr[idx + 1]]

                d = sparse_dist(from_inds, from_data, to_inds, to_data)

                heap_push(heap, i, d, idx, 1)

    return True
Beispiel #4
0
def apply_hyperplane(
    data,
    hyperplane_vector,
    hyperplane_offset,
    hyperplane_node_num,
    current_num_nodes,
    data_node_loc,
    rng_state,
):

    left_node = current_num_nodes
    right_node = current_num_nodes + 1

    for i in range(data_node_loc.shape[0]):
        if data_node_loc[i] != hyperplane_node_num:
            continue

        margin = hyperplane_offset
        for d in range(hyperplane_vector.shape[0]):
            margin += hyperplane_vector[d] * data[i, d]

        if margin == 0:
            if abs(tau_rand_int(rng_state)) % 2 == 0:
                data_node_loc[i] = left_node
            else:
                data_node_loc[i] = right_node
        elif margin > 0:
            data_node_loc[i] = left_node
        else:
            data_node_loc[i] = right_node

    return
Beispiel #5
0
def search_init(
    query_inds,
    query_data,
    k,
    inds,
    indptr,
    data,
    forest,
    n_neighbors,
    tried,
    sparse_dist,
    rng_state,
):

    heap_priorities = np.float32(np.inf) + np.zeros(k, dtype=np.float32)
    heap_indices = np.int32(-1) + np.zeros(k, dtype=np.int32)
    n_samples = indptr.shape[0] - 1

    n_random_samples = min(k, n_neighbors)

    for tree in forest:
        indices = search_sparse_flat_tree(
            query_inds,
            query_data,
            tree.hyperplanes,
            tree.offsets,
            tree.children,
            tree.indices,
            rng_state,
        )

        n_initial_points = indices.shape[0]
        n_random_samples = min(k, n_neighbors) - n_initial_points

        for j in range(n_initial_points):
            candidate = indices[j]

            from_inds = inds[indptr[candidate] : indptr[candidate + 1]]
            from_data = data[indptr[candidate] : indptr[candidate + 1]]

            d = sparse_dist(from_inds, from_data, query_inds, query_data)

            # indices are guaranteed different
            simple_heap_push(heap_priorities, heap_indices, d, candidate)
            mark_visited(tried, candidate)

    if n_random_samples > 0:
        for i in range(n_random_samples):
            candidate = np.abs(tau_rand_int(rng_state)) % n_samples
            if has_been_visited(tried, candidate) == 0:
                from_inds = inds[indptr[candidate] : indptr[candidate + 1]]
                from_data = data[indptr[candidate] : indptr[candidate + 1]]

                d = sparse_dist(from_inds, from_data, query_inds, query_data,)

                simple_heap_push(heap_priorities, heap_indices, d, candidate)
                mark_visited(tried, candidate)

    return heap_priorities, heap_indices
Beispiel #6
0
def make_euclidean_hyperplane(data, indices, rng_state):
    left_index = tau_rand_int(rng_state) % indices.shape[0]
    right_index = tau_rand_int(rng_state) % indices.shape[0]
    right_index += left_index == right_index
    right_index = right_index % indices.shape[0]
    left = indices[left_index]
    right = indices[right_index]

    # Compute the normal vector to the hyperplane (the vector between
    # the two points) and the offset from the origin
    hyperplane_offset = 0.0
    hyperplane_vector = np.empty(data.shape[1], dtype=np.float32)

    for d in range(data.shape[1]):
        hyperplane_vector[d] = data[left, d] - data[right, d]
        hyperplane_offset -= (hyperplane_vector[d] *
                              (data[left, d] + data[right, d]) / 2.0)

    return hyperplane_vector, hyperplane_offset
Beispiel #7
0
def current_graph_map_jit(heap, rows, n_neighbors, data, rng_state,
                          seed_per_row, dist, dist_args):
    rng_state_local = rng_state.copy()
    for i in rows:
        if seed_per_row:
            seed(rng_state_local, i)
        if heap[0, i, 0] < 0.0:
            for j in range(n_neighbors - np.sum(heap[0, i] >= 0.0)):
                idx = np.abs(tau_rand_int(rng_state_local)) % data.shape[0]
                d = dist(data[i], data[idx], *dist_args)
                heap_push(heap, i, d, idx, 1)

    return True
Beispiel #8
0
def select_side(hyperplane, offset, point, rng_state):
    margin = offset
    for d in range(point.shape[0]):
        margin += hyperplane[d] * point[d]

    if abs(margin) < EPS:
        side = tau_rand_int(rng_state) % 2
        if side == 0:
            return 0
        else:
            return 1
    elif margin > 0:
        return 0
    else:
        return 1
Beispiel #9
0
def init_random(n_neighbors, inds, indptr, data, heap, dist, rng_state):
    n_samples = indptr.shape[0] - 1
    for i in range(n_samples):
        if heap[0][i, 0] < 0.0:
            for j in range(n_neighbors - np.sum(heap[0][i] >= 0.0)):
                idx = np.abs(tau_rand_int(rng_state)) % n_samples

                from_inds = inds[indptr[idx] : indptr[idx + 1]]
                from_data = data[indptr[idx] : indptr[idx + 1]]

                to_inds = inds[indptr[i] : indptr[i + 1]]
                to_data = data[indptr[i] : indptr[i + 1]]
                d = dist(from_inds, from_data, to_inds, to_data)

                heap_push(heap, i, d, idx, 1)

    return
Beispiel #10
0
def sparse_select_side(hyperplane, offset, point_inds, point_data, rng_state):
    margin = offset

    hyperplane_inds = arr_unique(hyperplane[0])
    hyperplane_data = hyperplane[1, : hyperplane_inds.shape[0]]

    _, aux_data = sparse_mul(hyperplane_inds, hyperplane_data, point_inds, point_data)

    for d in range(aux_data.shape[0]):
        margin += aux_data[d]

    if abs(margin) < EPS:
        side = tau_rand_int(rng_state) % 2
        if side == 0:
            return 0
        else:
            return 1
    elif margin > 0:
        return 0
    else:
        return 1
Beispiel #11
0
def angular_random_projection_split(data, indices, rng_state):
    """Given a set of ``graph_indices`` for graph_data points from ``graph_data``, create
    a random hyperplane to split the graph_data, returning two arrays graph_indices
    that fall on either side of the hyperplane. This is the basis for a
    random projection tree, which simply uses this splitting recursively.
    This particular split uses cosine distance to determine the hyperplane
    and which side each graph_data sample falls on.
    Parameters
    ----------
    data: array of shape (n_samples, n_features)
        The original graph_data to be split
    indices: array of shape (tree_node_size,)
        The graph_indices of the elements in the ``graph_data`` array that are to
        be split in the current operation.
    rng_state: array of int64, shape (3,)
        The internal state of the rng
    Returns
    -------
    indices_left: array
        The elements of ``graph_indices`` that fall on the "left" side of the
        random hyperplane.
    indices_right: array
        The elements of ``graph_indices`` that fall on the "left" side of the
        random hyperplane.
    """
    dim = data.shape[1]

    # Select two random points, set the hyperplane between them
    left_index = tau_rand_int(rng_state) % indices.shape[0]
    right_index = tau_rand_int(rng_state) % indices.shape[0]
    right_index += left_index == right_index
    right_index = right_index % indices.shape[0]
    left = indices[left_index]
    right = indices[right_index]

    left_norm = norm(data[left])
    right_norm = norm(data[right])

    if abs(left_norm) < EPS:
        left_norm = 1.0

    if abs(right_norm) < EPS:
        right_norm = 1.0

    # Compute the normal vector to the hyperplane (the vector between
    # the two points)
    hyperplane_vector = np.empty(dim, dtype=np.float32)

    for d in range(dim):
        hyperplane_vector[d] = (data[left, d] / left_norm) - (data[right, d] /
                                                              right_norm)

    hyperplane_norm = norm(hyperplane_vector)
    if abs(hyperplane_norm) < EPS:
        hyperplane_norm = 1.0

    for d in range(dim):
        hyperplane_vector[d] = hyperplane_vector[d] / hyperplane_norm

    # For each point compute the margin (project into normal vector)
    # If we are on lower side of the hyperplane put in one pile, otherwise
    # put it in the other pile (if we hit hyperplane on the nose, flip a coin)
    n_left = 0
    n_right = 0
    side = np.empty(indices.shape[0], np.int8)
    for i in range(indices.shape[0]):
        margin = 0.0
        for d in range(dim):
            margin += hyperplane_vector[d] * data[indices[i], d]

        if abs(margin) < EPS:
            side[i] = tau_rand_int(rng_state) % 2
            if side[i] == 0:
                n_left += 1
            else:
                n_right += 1
        elif margin > 0:
            side[i] = 0
            n_left += 1
        else:
            side[i] = 1
            n_right += 1

    # Now that we have the counts allocate arrays
    indices_left = np.empty(n_left, dtype=np.int32)
    indices_right = np.empty(n_right, dtype=np.int32)

    # Populate the arrays with graph_indices according to which side they fell on
    n_left = 0
    n_right = 0
    for i in range(side.shape[0]):
        if side[i] == 0:
            indices_left[n_left] = indices[i]
            n_left += 1
        else:
            indices_right[n_right] = indices[i]
            n_right += 1

    return indices_left, indices_right, hyperplane_vector, 0.0
Beispiel #12
0
def sparse_euclidean_random_projection_split(inds, indptr, data, indices,
                                             rng_state):
    """Given a set of ``graph_indices`` for graph_data points from a sparse graph_data set
    presented in csr sparse format as inds, graph_indptr and graph_data, create
    a random hyperplane to split the graph_data, returning two arrays graph_indices
    that fall on either side of the hyperplane. This is the basis for a
    random projection tree, which simply uses this splitting recursively.
    This particular split uses cosine distance to determine the hyperplane
    and which side each graph_data sample falls on.
    Parameters
    ----------
    inds: array
        CSR format index array of the matrix
    indptr: array
        CSR format index pointer array of the matrix
    data: array
        CSR format graph_data array of the matrix
    indices: array of shape (tree_node_size,)
        The graph_indices of the elements in the ``graph_data`` array that are to
        be split in the current operation.
    rng_state: array of int64, shape (3,)
        The internal state of the rng
    Returns
    -------
    indices_left: array
        The elements of ``graph_indices`` that fall on the "left" side of the
        random hyperplane.
    indices_right: array
        The elements of ``graph_indices`` that fall on the "left" side of the
        random hyperplane.
    """
    # Select two random points, set the hyperplane between them
    left_index = np.abs(tau_rand_int(rng_state)) % indices.shape[0]
    right_index = np.abs(tau_rand_int(rng_state)) % indices.shape[0]
    right_index += left_index == right_index
    right_index = right_index % indices.shape[0]
    left = indices[left_index]
    right = indices[right_index]

    left_inds = inds[indptr[left]:indptr[left + 1]]
    left_data = data[indptr[left]:indptr[left + 1]]
    right_inds = inds[indptr[right]:indptr[right + 1]]
    right_data = data[indptr[right]:indptr[right + 1]]

    # Compute the normal vector to the hyperplane (the vector between
    # the two points) and the offset from the origin
    hyperplane_offset = 0.0
    hyperplane_inds, hyperplane_data = sparse_diff(left_inds, left_data,
                                                   right_inds, right_data)
    offset_inds, offset_data = sparse_sum(left_inds, left_data, right_inds,
                                          right_data)
    offset_data = offset_data / 2.0
    offset_inds, offset_data = sparse_mul(hyperplane_inds, hyperplane_data,
                                          offset_inds,
                                          offset_data.astype(np.float32))

    for val in offset_data:
        hyperplane_offset -= val

    # For each point compute the margin (project into normal vector, add offset)
    # If we are on lower side of the hyperplane put in one pile, otherwise
    # put it in the other pile (if we hit hyperplane on the nose, flip a coin)
    n_left = 0
    n_right = 0
    side = np.empty(indices.shape[0], np.int8)
    for i in range(indices.shape[0]):
        margin = hyperplane_offset
        i_inds = inds[indptr[indices[i]]:indptr[indices[i] + 1]]
        i_data = data[indptr[indices[i]]:indptr[indices[i] + 1]]

        _, mul_data = sparse_mul(hyperplane_inds, hyperplane_data, i_inds,
                                 i_data)
        for val in mul_data:
            margin += val

        if abs(margin) < EPS:
            side[i] = abs(tau_rand_int(rng_state)) % 2
            if side[i] == 0:
                n_left += 1
            else:
                n_right += 1
        elif margin > 0:
            side[i] = 0
            n_left += 1
        else:
            side[i] = 1
            n_right += 1

    # Now that we have the counts allocate arrays
    indices_left = np.empty(n_left, dtype=np.int32)
    indices_right = np.empty(n_right, dtype=np.int32)

    # Populate the arrays with graph_indices according to which side they fell on
    n_left = 0
    n_right = 0
    for i in range(side.shape[0]):
        if side[i] == 0:
            indices_left[n_left] = indices[i]
            n_left += 1
        else:
            indices_right[n_right] = indices[i]
            n_right += 1

    hyperplane = np.vstack((hyperplane_inds, hyperplane_data))

    return indices_left, indices_right, hyperplane, hyperplane_offset
Beispiel #13
0
def sparse_angular_random_projection_split(inds, indptr, data, indices,
                                           rng_state):
    """Given a set of ``indices`` for data points from a sparse data set
    presented in csr sparse format as inds, indptr and data, create
    a random hyperplane to split the data, returning two arrays indices
    that fall on either side of the hyperplane. This is the basis for a
    random projection tree, which simply uses this splitting recursively.
    This particular split uses cosine distance to determine the hyperplane
    and which side each data sample falls on.
    Parameters
    ----------
    inds: array
        CSR format index array of the matrix
    indptr: array
        CSR format index pointer array of the matrix
    data: array
        CSR format data array of the matrix
    indices: array of shape (tree_node_size,)
        The indices of the elements in the ``data`` array that are to
        be split in the current operation.
    rng_state: array of int64, shape (3,)
        The internal state of the rng
    Returns
    -------
    indices_left: array
        The elements of ``indices`` that fall on the "left" side of the
        random hyperplane.
    indices_right: array
        The elements of ``indices`` that fall on the "left" side of the
        random hyperplane.
    """
    # Select two random points, set the hyperplane between them
    left_index = tau_rand_int(rng_state) % indices.shape[0]
    right_index = tau_rand_int(rng_state) % indices.shape[0]
    right_index += left_index == right_index
    right_index = right_index % indices.shape[0]
    left = indices[left_index]
    right = indices[right_index]

    left_inds = inds[indptr[left]:indptr[left + 1]]
    left_data = data[indptr[left]:indptr[left + 1]]
    right_inds = inds[indptr[right]:indptr[right + 1]]
    right_data = data[indptr[right]:indptr[right + 1]]

    left_norm = norm(left_data)
    right_norm = norm(right_data)

    if abs(left_norm) < EPS:
        left_norm = 1.0

    if abs(right_norm) < EPS:
        right_norm = 1.0

    # Compute the normal vector to the hyperplane (the vector between
    # the two points)
    normalized_left_data = left_data / left_norm
    normalized_right_data = right_data / right_norm
    hyperplane_inds, hyperplane_data = sparse_diff(left_inds,
                                                   normalized_left_data,
                                                   right_inds,
                                                   normalized_right_data)

    hyperplane_norm = norm(hyperplane_data)
    if abs(hyperplane_norm) < EPS:
        hyperplane_norm = 1.0
    for d in range(hyperplane_data.shape[0]):
        hyperplane_data[d] = hyperplane_data[d] / hyperplane_norm

    # For each point compute the margin (project into normal vector)
    # If we are on lower side of the hyperplane put in one pile, otherwise
    # put it in the other pile (if we hit hyperplane on the nose, flip a coin)
    n_left = 0
    n_right = 0
    side = np.empty(indices.shape[0], np.int8)
    for i in range(indices.shape[0]):
        margin = 0.0

        i_inds = inds[indptr[indices[i]]:indptr[indices[i] + 1]]
        i_data = data[indptr[indices[i]]:indptr[indices[i] + 1]]

        _, mul_data = sparse_mul(hyperplane_inds, hyperplane_data, i_inds,
                                 i_data)
        for d in range(mul_data.shape[0]):
            margin += mul_data[d]

        if abs(margin) < EPS:
            side[i] = tau_rand_int(rng_state) % 2
            if side[i] == 0:
                n_left += 1
            else:
                n_right += 1
        elif margin > 0:
            side[i] = 0
            n_left += 1
        else:
            side[i] = 1
            n_right += 1

    # Now that we have the counts allocate arrays
    indices_left = np.empty(n_left, dtype=np.int64)
    indices_right = np.empty(n_right, dtype=np.int64)

    # Populate the arrays with indices according to which side they fell on
    n_left = 0
    n_right = 0
    for i in range(side.shape[0]):
        if side[i] == 0:
            indices_left[n_left] = indices[i]
            n_left += 1
        else:
            indices_right[n_right] = indices[i]
            n_right += 1

    hyperplane = np.vstack((hyperplane_inds, hyperplane_data))

    return indices_left, indices_right, hyperplane, None