def test_tril_triu_errors():
    """``da.triu`` must reject a 3-d input and a non-square 2-d input.

    A (10, 10, 10) array is expected to raise ``ValueError`` and a (30, 35)
    array is expected to raise ``NotImplementedError``.
    """
    cube = np.random.randint(0, 11, (10, 10, 10))
    lazy_cube = da.from_array(cube, chunks=(5, 5, 5))
    with pytest.raises(ValueError):
        da.triu(lazy_cube)

    rectangle = np.random.randint(0, 11, (30, 35))
    lazy_rectangle = da.from_array(rectangle, chunks=(5, 5))
    with pytest.raises(NotImplementedError):
        da.triu(lazy_rectangle)
def test_tril_triu():
    """Compare ``da.triu`` / ``da.tril`` with NumPy on a random 20x20 array.

    The comparison is repeated for two square chunk sizes, first with the
    default diagonal and then for a list of diagonal offsets that includes
    values lying outside the array (e.g. -25 and 21).
    """
    reference = np.random.randn(20, 20)
    offsets = (
        -25, -20, -19, -15, -14, -9, -8, -6, -5, -1,
        1, 4, 5, 6, 8, 10, 11, 15, 16, 19, 20, 21,
    )

    for chunk_size in (5, 4):
        lazy = da.from_array(reference, (chunk_size, chunk_size))

        # default diagonal
        assert np.allclose(da.triu(lazy).compute(), np.triu(reference))
        assert np.allclose(da.tril(lazy).compute(), np.tril(reference))

        # explicit diagonal offsets
        for offset in offsets:
            upper = da.triu(lazy, offset).compute()
            assert np.allclose(upper, np.triu(reference, offset))
            lower = da.tril(lazy, offset).compute()
            assert np.allclose(lower, np.tril(reference, offset))
def test_tsqr_zero_height_chunks():
    """``da.linalg.qr`` must cope with row chunks of height zero.

    Two cases are exercised on the same random (10, 5) matrix:

    * "certainty": the zero-height chunks are given explicitly in ``chunks``;
    * "uncertainty": the rows are picked with a boolean dask mask, and the
      factors are computed before being checked.
    """
    expected_q_shape = (10, 5)
    expected_r_shape = (5, 5)
    identity = np.eye(5, 5)

    # certainty
    mat = np.random.rand(10, 5)
    lazy = da.from_array(mat, chunks=((4, 0, 1, 0, 5), (5,)))
    q, r = da.linalg.qr(lazy)
    assert_eq(expected_q_shape, q.shape)  # shape check
    assert_eq(expected_r_shape, r.shape)  # shape check
    assert_eq(mat, da.dot(q, r))  # accuracy check
    assert_eq(identity, da.dot(q.T, q))  # q must be orthonormal
    assert_eq(r, da.triu(r.rechunk(r.shape[0])))  # r must be upper triangular

    # uncertainty
    padded = np.vstack([mat, -np.ones((10, 5))])
    first_column = padded[:, 0]
    lazy_padded = da.from_array(padded, chunks=5)
    lazy_column = da.from_array(first_column, chunks=5)
    # drop the rows of -1 appended above so that only ``mat`` remains
    lazy = lazy_padded[lazy_column >= 0, :]
    q, r = da.linalg.qr(lazy)
    q = q.compute()  # because uncertainty
    r = r.compute()
    assert_eq(expected_q_shape, q.shape)  # shape check
    assert_eq(expected_r_shape, r.shape)  # shape check
    assert_eq(mat, np.dot(q, r))  # accuracy check
    assert_eq(identity, np.dot(q.T, q))  # q must be orthonormal
    assert_eq(r, np.triu(r))  # r must be upper triangular
def test_qr(m, n, chunks, error_type):
    """Exercise ``qr`` on a random (m, n) matrix with the given chunking.

    If ``error_type`` is not None the call must raise it. Otherwise the
    returned factors are checked for shape, reconstruction accuracy,
    orthonormality of ``q`` and upper-triangularity of ``r``.
    """
    mat = np.random.rand(m, n)
    data = da.from_array(mat, chunks=chunks, name='A')

    # error path first, so the happy path below reads linearly
    if error_type is not None:
        with pytest.raises(error_type):
            q, r = qr(data)
        return

    rank = min(m, n)
    q, r = qr(data)
    assert_eq((m, rank), q.shape)  # shape check
    assert_eq((rank, n), r.shape)  # shape check
    assert_eq(mat, da.dot(q, r))  # accuracy check
    assert_eq(np.eye(rank, rank), da.dot(q.T, q))  # q must be orthonormal
    assert_eq(r, da.triu(r.rechunk(r.shape[0])))  # r must be upper triangular
def test_tsqr(m, n, chunks, error_type):
    """Exercise ``tsqr`` on a random (m, n) matrix with the given chunking.

    If ``error_type`` is not None the call must raise it. Otherwise the
    returned factors are checked for shape, reconstruction accuracy,
    orthonormality of ``q`` and upper-triangularity of ``r``.
    """
    mat = np.random.rand(m, n)
    data = da.from_array(mat, chunks=chunks, name='A')

    # error path first, so the happy path below reads linearly
    if error_type is not None:
        with pytest.raises(error_type):
            q, r = tsqr(data)
        return

    rank = min(m, n)
    q, r = tsqr(data)
    assert_eq((m, rank), q.shape)  # shape check
    assert_eq((rank, n), r.shape)  # shape check
    assert_eq(mat, da.dot(q, r))  # accuracy check
    assert_eq(np.eye(rank, rank), da.dot(q.T, q))  # q must be orthonormal
    assert_eq(r, da.triu(r.rechunk(r.shape[0])))  # r must be upper triangular
def test_tsqr(m, n, chunks, error_type):
    """Exercise ``tsqr`` in QR mode and in SVD mode on a random (m, n) matrix.

    If ``error_type`` is not None, both the plain call and the
    ``compute_svd=True`` call must raise it. Otherwise the QR factors and the
    SVD factors are checked for shape, orthonormality and reconstruction
    accuracy, and the singular values are compared with ``np.linalg.svd``.
    """
    mat = np.random.rand(m, n)
    data = da.from_array(mat, chunks=chunks, name="A")

    # error path first, so the happy path below reads linearly
    if error_type is not None:
        with pytest.raises(error_type):
            q, r = tsqr(data)
        with pytest.raises(error_type):
            u, s, vh = tsqr(data, compute_svd=True)
        return

    rank = min(m, n)
    # vh is checked as a full square matrix, hence the larger dimension
    vh_dim = max(rank, n)

    # test QR
    q, r = tsqr(data)
    assert_eq((m, rank), q.shape)  # shape check
    assert_eq((rank, n), r.shape)  # shape check
    assert_eq(mat, da.dot(q, r))  # accuracy check
    assert_eq(np.eye(rank, rank), da.dot(q.T, q))  # q must be orthonormal
    assert_eq(r, da.triu(r.rechunk(r.shape[0])))  # r must be upper triangular

    # test SVD
    u, s, vh = tsqr(data, compute_svd=True)
    expected_singular_values = np.linalg.svd(mat)[1]
    assert_eq(s, expected_singular_values)  # s must contain the singular values
    assert_eq((m, rank), u.shape)  # shape check
    assert_eq((rank,), s.shape)  # shape check
    assert_eq((vh_dim, vh_dim), vh.shape)  # shape check
    assert_eq(np.eye(rank, rank), da.dot(u.T, u))  # u must be orthonormal
    assert_eq(np.eye(vh_dim, vh_dim), da.dot(vh, vh.T))  # vh must be orthonormal
    assert_eq(mat, da.dot(da.dot(u, da.diag(s)), vh[:rank]))  # accuracy check
def test_tsqr(m, n, chunks, error_type):
    """Exercise ``tsqr`` in QR mode and in SVD mode on a random (m, n) matrix.

    If ``error_type`` is not None, both the plain call and the
    ``compute_svd=True`` call must raise it. Otherwise the QR factors and the
    SVD factors are checked for shape, orthonormality and reconstruction
    accuracy, and the singular values are compared with ``np.linalg.svd``.
    """
    mat = np.random.rand(m, n)
    data = da.from_array(mat, chunks=chunks, name='A')

    # error path first, so the happy path below reads linearly
    if error_type is not None:
        with pytest.raises(error_type):
            q, r = tsqr(data)
        with pytest.raises(error_type):
            u, s, vh = tsqr(data, compute_svd=True)
        return

    rank = min(m, n)
    # vh is checked as a full square matrix, hence the larger dimension
    vh_dim = max(rank, n)

    # test QR
    q, r = tsqr(data)
    assert_eq((m, rank), q.shape)  # shape check
    assert_eq((rank, n), r.shape)  # shape check
    assert_eq(mat, da.dot(q, r))  # accuracy check
    assert_eq(np.eye(rank, rank), da.dot(q.T, q))  # q must be orthonormal
    assert_eq(r, da.triu(r.rechunk(r.shape[0])))  # r must be upper triangular

    # test SVD
    u, s, vh = tsqr(data, compute_svd=True)
    expected_singular_values = np.linalg.svd(mat)[1]
    assert_eq(s, expected_singular_values)  # s must contain the singular values
    assert_eq((m, rank), u.shape)  # shape check
    assert_eq((rank,), s.shape)  # shape check
    assert_eq((vh_dim, vh_dim), vh.shape)  # shape check
    assert_eq(np.eye(rank, rank), da.dot(u.T, u))  # u must be orthonormal
    assert_eq(np.eye(vh_dim, vh_dim), da.dot(vh, vh.T))  # vh must be orthonormal
    assert_eq(mat, da.dot(da.dot(u, da.diag(s)), vh[:rank]))  # accuracy check
def test_tril_triu_non_square_arrays():
    """``da.triu`` / ``da.tril`` must match NumPy on a (30, 35) array."""
    rectangle = np.random.randint(0, 11, (30, 35))
    lazy_rectangle = da.from_array(rectangle, chunks=(5, 5))

    expected_upper = np.triu(rectangle)
    assert_eq(da.triu(lazy_rectangle), expected_upper)

    expected_lower = np.tril(rectangle)
    assert_eq(da.tril(lazy_rectangle), expected_lower)
def test_tril_triu_errors():
    """``da.triu`` must raise ``ValueError`` for a 3-d input."""
    cube = np.random.randint(0, 11, (10, 10, 10))
    lazy_cube = da.from_array(cube, chunks=(5, 5, 5))
    with pytest.raises(ValueError):
        da.triu(lazy_cube)
def _check_lu_result(p, l, u, A):
    """Assert that ``p``, ``l``, ``u`` form a valid LU decomposition of ``A``.

    The product ``p . l . u`` must be close to ``A``, ``l`` must equal its own
    lower triangle and ``u`` its own upper triangle. Graph checks are
    disabled in the triangular comparisons.
    """
    reconstructed = p.dot(l).dot(u)
    assert np.allclose(reconstructed, A)

    # check triangulars
    lower_part = da.tril(l)
    assert_eq(l, lower_part, check_graph=False)
    upper_part = da.triu(u)
    assert_eq(u, upper_part, check_graph=False)
def _check_lu_result(p, l, u, A):
    """Assert that ``p``, ``l``, ``u`` form a valid LU decomposition of ``A``.

    The product ``p . l . u`` must be close to ``A``, ``l`` must equal its own
    lower triangle and ``u`` its own upper triangle.
    """
    reconstructed = p.dot(l).dot(u)
    assert np.allclose(reconstructed, A)

    # check triangulars
    lower_part = da.tril(l)
    assert_eq(l, lower_part)
    upper_part = da.triu(u)
    assert_eq(u, upper_part)
def pairwise_distance(
    x: ArrayLike,
    metric: str = "euclidean",
) -> np.ndarray:
    """Compute the distance between every pair of row vectors of ``x``.

    Take the following (4, 5) two dimensional array as an illustration:

        [e.00, e.01, e.02, e.03, e.04]
        [e.10, e.11, e.12, e.13, e.14]
        [e.20, e.21, e.22, e.23, e.24]
        [e.30, e.31, e.32, e.33, e.34]

    Each row is one vector; call them v0, v1, v2, v3. Measuring every vector
    against every other vector gives 16 distances, arranged as a symmetric
    (4, 4) matrix:

        [v0.v0, v0.v1, v0.v2, v0.v3]
        [v1.v0, v1.v1, v1.v2, v1.v3]
        [v2.v0, v2.v1, v2.v2, v2.v3]
        [v3.v0, v3.v1, v3.v2, v3.v3]

    Entry (i, j) of the result is the distance between vectors vi and vj.

    Negative and nan values are considered as missing values. They are
    ignored for all distance metric calculations.

    Parameters
    ----------
    x
        [array-like, shape: (M, N)]
        An array like two dimensional matrix. The rows are the
        vectors used for comparison, i.e. for pairwise distance.
    metric
        The distance metric to use. The distance function can be
        'euclidean' or 'correlation'.

    Returns
    -------

    [array-like, shape: (M, M)]
    A two dimensional distance matrix, which will be symmetric. The dimension
    will be (M, M). The (i, j) position in the resulting array
    (matrix) denotes the distance between ith and jth row vectors
    in the input array.

    Raises
    ------
    NotImplementedError
        If ``metric`` is not an attribute of the ``metrics`` module.

    Examples
    --------

    >>> from sgkit.distance.api import pairwise_distance
    >>> import dask.array as da
    >>> x = da.array([[6, 4, 1,], [4, 5, 2], [9, 7, 3]]).rechunk(2, 2)
    >>> pairwise_distance(x, metric='euclidean')
    array([[0.        , 2.44948974, 4.69041576],
           [2.44948974, 0.        , 5.47722558],
           [4.69041576, 5.47722558, 0.        ]])

    >>> import numpy as np
    >>> x = np.array([[6, 4, 1,], [4, 5, 2], [9, 7, 3]])
    >>> pairwise_distance(x, metric='euclidean')
    array([[0.        , 2.44948974, 4.69041576],
           [2.44948974, 0.        , 5.47722558],
           [4.69041576, 5.47722558, 0.        ]])

    >>> x = np.array([[6, 4, 1,], [4, 5, 2], [9, 7, 3]])
    >>> pairwise_distance(x, metric='correlation')
    array([[1.11022302e-16, 2.62956526e-01, 2.82353505e-03],
           [2.62956526e-01, 0.00000000e+00, 2.14285714e-01],
           [2.82353505e-03, 2.14285714e-01, 0.00000000e+00]])
    """
    try:
        metric_ufunc = getattr(metrics, metric)
    except AttributeError:
        raise NotImplementedError(f"Given metric: {metric} is not implemented.")

    x = da.asarray(x)

    def _block_distance(rows, other_rows):
        # Insert an axis so each row of ``rows`` broadcasts against all of
        # ``other_rows``.
        return metric_ufunc(rows[:, None, :], other_rows)

    all_pairs = da.blockwise(
        _block_distance,
        "jk",
        x,
        "ji",
        x,
        "ki",
        dtype="float64",
        concatenate=True,
    )

    # Rebuild a symmetric matrix from the upper triangle: the strict upper
    # part plus the transposed upper part (which carries the diagonal).
    strict_upper = da.triu(all_pairs, 1)
    lower_with_diagonal = da.triu(all_pairs).T
    symmetric = strict_upper + lower_with_diagonal
    return symmetric.compute()
def pairwise_distance(
    x: ArrayLike,
    metric: MetricTypes = "euclidean",
    split_every: typing.Optional[int] = None,
) -> da.Array:
    """Calculates the pairwise distance between all pairs of row vectors in the
    given two dimensional array x.

    To illustrate the algorithm consider the following (4, 5) two dimensional array:

        [e.00, e.01, e.02, e.03, e.04]
        [e.10, e.11, e.12, e.13, e.14]
        [e.20, e.21, e.22, e.23, e.24]
        [e.30, e.31, e.32, e.33, e.34]

    The rows of the above matrix are the set of vectors. Now let's label all
    the vectors as v0, v1, v2, v3.

    The result will be a two dimensional symmetric matrix which will contain
    the distance between all pairs. Since there are 4 vectors, calculating the
    distance between each vector and every other vector, will result in 16
    distances and the resultant array will be of size (4, 4) as follows:

        [v0.v0, v0.v1, v0.v2, v0.v3]
        [v1.v0, v1.v1, v1.v2, v1.v3]
        [v2.v0, v2.v1, v2.v2, v2.v3]
        [v3.v0, v3.v1, v3.v2, v3.v3]

    The (i, j) position in the resulting array (matrix) denotes the distance
    between vi and vj vectors.

    Negative and nan values are considered as missing values. They are ignored
    for all distance metric calculations.

    Parameters
    ----------
    x
        [array-like, shape: (M, N)]
        An array like two dimensional matrix. The rows are the
        vectors used for comparison, i.e. for pairwise distance.
    metric
        The distance metric to use. The distance function can be
        'euclidean' or 'correlation'.
    split_every
        Determines the depth of the recursive aggregation in the reduction
        step. This argument is directly passed to the call to
        ``dask.reduction`` function in the reduce step of the map reduce.

        Omit to let dask heuristically decide a good default. A default can
        also be set globally with the split_every key in dask.config.

    Returns
    -------

    [array-like, shape: (M, M)]
    A two dimensional distance matrix, which will be symmetric. The dimension
    will be (M, M). The (i, j) position in the resulting array
    (matrix) denotes the distance between ith and jth row vectors
    in the input array. The result is lazy; call ``.compute()`` to evaluate it.

    Raises
    ------
    NotImplementedError
        If the map function, the reduce function or the map parameter count
        for ``metric`` cannot be found in the ``metrics`` module.
    ValueError
        If ``x`` is not 2-dimensional.

    Examples
    --------

    >>> from sgkit.distance.api import pairwise_distance
    >>> import dask.array as da
    >>> x = da.array([[6, 4, 1,], [4, 5, 2], [9, 7, 3]]).rechunk(2, 2)
    >>> pairwise_distance(x, metric='euclidean').compute()
    array([[0.        , 2.44948974, 4.69041576],
           [2.44948974, 0.        , 5.47722558],
           [4.69041576, 5.47722558, 0.        ]])

    >>> import numpy as np
    >>> x = np.array([[6, 4, 1,], [4, 5, 2], [9, 7, 3]])
    >>> pairwise_distance(x, metric='euclidean').compute()
    array([[0.        , 2.44948974, 4.69041576],
           [2.44948974, 0.        , 5.47722558],
           [4.69041576, 5.47722558, 0.        ]])

    >>> x = np.array([[6, 4, 1,], [4, 5, 2], [9, 7, 3]])
    >>> pairwise_distance(x, metric='correlation').compute()
    array([[-2.22044605e-16,  2.62956526e-01,  2.82353505e-03],
           [ 2.62956526e-01,  0.00000000e+00,  2.14285714e-01],
           [ 2.82353505e-03,  2.14285714e-01,  0.00000000e+00]])
    """
    try:
        metric_map_func = getattr(metrics, f"{metric}_map")
        metric_reduce_func = getattr(metrics, f"{metric}_reduce")
        n_map_param = metrics.N_MAP_PARAM[metric]
    except (AttributeError, KeyError) as err:
        # The N_MAP_PARAM subscript can fail with KeyError rather than
        # AttributeError; report both the same way.
        raise NotImplementedError(
            f"Given metric: {metric} is not implemented."
        ) from err

    x = da.asarray(x)
    if x.ndim != 2:
        raise ValueError(f"2-dimensional array expected, got '{x.ndim}'")

    # setting this variable outside of _pairwise to avoid its re-creation
    # in every iteration, which eventually leads to increase in dask
    # graph serialisation/deserialisation time significantly
    metric_param = np.empty(n_map_param, dtype=x.dtype)

    def _pairwise(f: ArrayLike, g: ArrayLike) -> ArrayLike:
        result: ArrayLike = metric_map_func(f[:, None, :], g, metric_param)
        # Adding a new axis to help combine chunks along this axis in the
        # reduction step (see the _aggregate and _combine functions below).
        return result[..., np.newaxis]

    # concatenate in blockwise leads to high memory footprints, so instead
    # we perform blockwise without contraction followed by reduction.
    # More about this issue: https://github.com/dask/dask/issues/6874
    out = da.blockwise(
        _pairwise,
        "ijk",
        x,
        "ik",
        x,
        "jk",
        dtype=x.dtype,
        concatenate=False,
    )

    def _aggregate(x_chunk: ArrayLike, **_: typing.Any) -> ArrayLike:
        """Last function to be executed when resolving the dask graph,
        producing the final output. It is always invoked, even when the reduced
        Array counts a single chunk along the reduced axes."""
        x_chunk = x_chunk.reshape(x_chunk.shape[:-2] + (-1, n_map_param))
        result: ArrayLike = metric_reduce_func(x_chunk)
        return result

    def _chunk(x_chunk: ArrayLike, **_: typing.Any) -> ArrayLike:
        # The map step already ran inside ``_pairwise``; pass chunks through.
        return x_chunk

    def _combine(x_chunk: ArrayLike, **_: typing.Any) -> ArrayLike:
        """Function used for intermediate recursive aggregation (see
        split_every argument to ``da.reduction below``).  If the
        reduction can be performed in less than 3 steps, it will
        not be invoked at all."""
        # reduce chunks by summing along the -2 axis
        x_chunk_reshaped = x_chunk.reshape(x_chunk.shape[:-2] + (-1, n_map_param))
        return x_chunk_reshaped.sum(axis=-2)[..., np.newaxis]

    r = da.reduction(
        out,
        chunk=_chunk,
        combine=_combine,
        aggregate=_aggregate,
        axis=-1,
        dtype=x.dtype,
        meta=np.ndarray((0, 0), dtype=x.dtype),
        split_every=split_every,
        name="pairwise",
    )

    # Symmetrise from the upper triangle. The strict upper triangle (k=1) is
    # combined with the transposed upper triangle so that the main diagonal
    # is taken exactly once; ``triu(r) + triu(r).T`` would add it twice.
    return da.triu(r, 1) + da.triu(r).T