def test_reductions_2D_int():
    x = np.arange(1, 122).reshape((11, 11)).astype('i4')
    a = da.from_array(x, chunks=(4, 4))

    reduction_2d_test(da.sum, a, np.sum, x)
    reduction_2d_test(da.prod, a, np.prod, x)
    reduction_2d_test(da.mean, a, np.mean, x)
    reduction_2d_test(da.var, a, np.var, x, False)  # Difference in dtype algo
    reduction_2d_test(da.std, a, np.std, x, False)  # Difference in dtype algo
    reduction_2d_test(da.min, a, np.min, x, False)
    reduction_2d_test(da.max, a, np.max, x, False)
    reduction_2d_test(da.any, a, np.any, x, False)
    reduction_2d_test(da.all, a, np.all, x, False)

    reduction_2d_test(da.nansum, a, np.nansum, x)
    with ignoring(AttributeError):
        reduction_2d_test(da.nanprod, a, np.nanprod, x)
    reduction_2d_test(da.nanmean, a, np.mean, x)
    reduction_2d_test(da.nanvar, a, np.nanvar, x, False)  # Difference in dtype algo
    reduction_2d_test(da.nanstd, a, np.nanstd, x, False)  # Difference in dtype algo
    reduction_2d_test(da.nanmin, a, np.nanmin, x, False)
    reduction_2d_test(da.nanmax, a, np.nanmax, x, False)

    assert eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    assert eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
    assert eq(da.argmax(a, axis=1), np.argmax(x, axis=1))
    assert eq(da.argmin(a, axis=1), np.argmin(x, axis=1))
    assert eq(da.nanargmax(a, axis=1), np.nanargmax(x, axis=1))
    assert eq(da.nanargmin(a, axis=1), np.nanargmin(x, axis=1))
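# The reduction tests in this section rely on shared helpers that are not
# shown here (`eq`, `reduction_1d_test`, `reduction_2d_test`, `ignoring`).
# Minimal sketches are given below; the flag names and the exact set of
# checks are assumptions inferred from the call sites, not the canonical
# test-suite implementations.
import numpy as np
import dask.array as da
from contextlib import suppress as ignoring  # e.g. old NumPy lacking nanprod


def eq(a, b):
    # Compare a dask result against a NumPy reference, computing the lazy
    # side first.
    a = a.compute() if isinstance(a, da.Array) else a
    b = b.compute() if isinstance(b, da.Array) else b
    return np.allclose(a, b, equal_nan=True)


def reduction_1d_test(da_func, darr, np_func, narr, use_dtype=True,
                      split_every=True):
    assert eq(da_func(darr), np_func(narr))
    assert eq(da_func(darr, keepdims=True), np_func(narr, keepdims=True))
    if use_dtype:
        assert da_func(darr).dtype == np_func(narr).dtype
    if split_every:
        assert eq(da_func(darr, split_every=2), np_func(narr))


def reduction_2d_test(da_func, darr, np_func, narr, use_dtype=True,
                      split_every=True):
    # Whole-array and per-axis comparisons; `use_dtype=False` skips dtype
    # checks (dask and NumPy may choose different accumulator dtypes) and
    # `split_every=False` skips the tree-reduction variants.
    assert eq(da_func(darr), np_func(narr))
    assert eq(da_func(darr, axis=0), np_func(narr, axis=0))
    assert eq(da_func(darr, axis=1), np_func(narr, axis=1))
    assert eq(da_func(darr, keepdims=True), np_func(narr, keepdims=True))
    if use_dtype:
        assert da_func(darr).dtype == np_func(narr).dtype
    if split_every:
        assert eq(da_func(darr, axis=0, split_every=2), np_func(narr, axis=0))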
def test_reductions_1D(dtype):
    x = np.arange(5).astype(dtype)
    a = da.from_array(x, chunks=(2,))

    reduction_1d_test(da.sum, a, np.sum, x)
    reduction_1d_test(da.prod, a, np.prod, x)
    reduction_1d_test(da.mean, a, np.mean, x)
    reduction_1d_test(da.var, a, np.var, x)
    reduction_1d_test(da.std, a, np.std, x)
    reduction_1d_test(da.min, a, np.min, x, False)
    reduction_1d_test(da.max, a, np.max, x, False)
    reduction_1d_test(da.any, a, np.any, x, False)
    reduction_1d_test(da.all, a, np.all, x, False)

    reduction_1d_test(da.nansum, a, np.nansum, x)
    with ignoring(AttributeError):
        reduction_1d_test(da.nanprod, a, np.nanprod, x)
    reduction_1d_test(da.nanmean, a, np.mean, x)
    reduction_1d_test(da.nanvar, a, np.var, x)
    reduction_1d_test(da.nanstd, a, np.std, x)
    reduction_1d_test(da.nanmin, a, np.nanmin, x, False)
    reduction_1d_test(da.nanmax, a, np.nanmax, x, False)

    assert eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    assert eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
    assert eq(da.argmax(a, axis=0, split_every=2), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0, split_every=2), np.argmin(x, axis=0))
    assert eq(da.nanargmax(a, axis=0, split_every=2), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0, split_every=2), np.nanargmin(x, axis=0))
def test_reductions_2D_nans():
    # chunks are a mix of some/all/no NaNs
    x = np.full((4, 4), np.nan)
    x[:2, :2] = np.array([[1, 2], [3, 4]])
    x[2, 2] = 5
    x[3, 3] = 6
    a = da.from_array(x, chunks=(2, 2))

    reduction_2d_test(da.sum, a, np.sum, x, False, False)
    reduction_2d_test(da.prod, a, np.prod, x, False, False)
    reduction_2d_test(da.mean, a, np.mean, x, False, False)
    reduction_2d_test(da.var, a, np.var, x, False, False)
    reduction_2d_test(da.std, a, np.std, x, False, False)
    reduction_2d_test(da.min, a, np.min, x, False, False)
    reduction_2d_test(da.max, a, np.max, x, False, False)
    reduction_2d_test(da.any, a, np.any, x, False, False)
    reduction_2d_test(da.all, a, np.all, x, False, False)

    reduction_2d_test(da.nansum, a, np.nansum, x, False, False)
    reduction_2d_test(da.nanprod, a, np.nanprod, x, False, False)
    reduction_2d_test(da.nanmean, a, np.nanmean, x, False, False)
    with pytest.warns(None):  # division by 0 warning
        reduction_2d_test(da.nanvar, a, np.nanvar, x, False, False)
    with pytest.warns(None):  # division by 0 warning
        reduction_2d_test(da.nanstd, a, np.nanstd, x, False, False)
    with pytest.warns(None):  # all NaN axis warning
        reduction_2d_test(da.nanmin, a, np.nanmin, x, False, False)
    with pytest.warns(None):  # all NaN axis warning
        reduction_2d_test(da.nanmax, a, np.nanmax, x, False, False)

    assert_eq(da.argmax(a), np.argmax(x))
    assert_eq(da.argmin(a), np.argmin(x))
    with pytest.warns(None):  # all NaN axis warning
        assert_eq(da.nanargmax(a), np.nanargmax(x))
    with pytest.warns(None):  # all NaN axis warning
        assert_eq(da.nanargmin(a), np.nanargmin(x))
    assert_eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert_eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    with pytest.warns(None):  # all NaN axis warning
        assert_eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    with pytest.warns(None):  # all NaN axis warning
        assert_eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
    assert_eq(da.argmax(a, axis=1), np.argmax(x, axis=1))
    assert_eq(da.argmin(a, axis=1), np.argmin(x, axis=1))
    with pytest.warns(None):  # all NaN axis warning
        assert_eq(da.nanargmax(a, axis=1), np.nanargmax(x, axis=1))
    with pytest.warns(None):  # all NaN axis warning
        assert_eq(da.nanargmin(a, axis=1), np.nanargmin(x, axis=1))
def test_reductions_1D_int():
    x = np.arange(5).astype('i4')
    a = da.from_array(x, chunks=(2,))

    reduction_1d_test(da.sum, a, np.sum, x)
    reduction_1d_test(da.prod, a, np.prod, x)
    reduction_1d_test(da.mean, a, np.mean, x)
    reduction_1d_test(da.var, a, np.var, x)
    reduction_1d_test(da.std, a, np.std, x)
    reduction_1d_test(da.min, a, np.min, x, False)
    reduction_1d_test(da.max, a, np.max, x, False)
    reduction_1d_test(da.any, a, np.any, x, False)
    reduction_1d_test(da.all, a, np.all, x, False)

    reduction_1d_test(da.nansum, a, np.nansum, x)
    with ignoring(AttributeError):
        reduction_1d_test(da.nanprod, a, np.nanprod, x)
    reduction_1d_test(da.nanmean, a, np.mean, x)
    reduction_1d_test(da.nanvar, a, np.var, x)
    reduction_1d_test(da.nanstd, a, np.std, x)
    reduction_1d_test(da.nanmin, a, np.nanmin, x, False)
    reduction_1d_test(da.nanmax, a, np.nanmax, x, False)

    assert eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    assert eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
def kmeans(data, k=None, centroids=None, steps=100):
    """Divide the observations in data into clusters using the k-means
    algorithm. Return the cluster assignment of each data point together
    with the final centroid positions.

    centroids, if supplied, must be an array giving the initial position of
    the centroids of each cluster.

    If centroids is omitted, the number k gives the number of clusters and
    the initial positions of the centroids are selected randomly from the
    data.

    The k-means algorithm adjusts the centroids iteratively for the given
    number of steps, or until no further progress can be made.

    >>> data = np.array([[12, 10, 87],
    ...                  [ 2, 12, 33],
    ...                  [68, 31, 32],
    ...                  [88, 13, 66],
    ...                  [79, 40, 89],
    ...                  [ 1, 77, 12]])
    >>> np.random.seed(73)
    >>> clusters, centroids = kmeans(data, k=3)
    >>> np.asarray(clusters)
    array([1, 1, 2, 2, 0, 1])
    """
    if centroids is not None and k is not None:
        assert k == len(centroids)
    elif centroids is not None:
        k = len(centroids)
    elif k is not None:
        # Forgy initialization method: choose k data points randomly.
        centroids = data[np.random.choice(np.arange(len(data)), k, False)]
    else:
        raise RuntimeError("Need a value for k or centroids.")

    # Chunk length equal to the CPU count.
    da_data = da.from_array(data, chunks=multiprocessing.cpu_count())
    da_centroids = da.from_array(centroids, chunks=multiprocessing.cpu_count())

    for step in range(max(steps, 1)):
        print("Iteration:", step)
        # Squared distances between each point and each centroid.
        sqdists = euclidean(da_centroids, da_data)
        # Index of the closest centroid to each data point.
        da_clusters = da.argmin(sqdists, axis=0)
        da_new_centroids = cluster_centroids(da_data, da_clusters, k)
        if np.array_equal(da_new_centroids.compute(), da_centroids.compute()):
            break
        da_centroids = da_new_centroids

    return da_clusters, da_centroids
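# `euclidean` and `cluster_centroids` are external helpers in the original
# code. Hedged sketches of what they plausibly compute, based on how kmeans
# calls them (squared distances with centroids along axis 0; per-cluster
# means):
import numpy as np
import dask.array as da


def euclidean(centroids, data):
    # Squared Euclidean distance between every centroid and every
    # observation; the (k, n_points) shape lets argmin over axis 0 pick
    # the closest centroid for each point.
    return ((centroids[:, None, :] - data[None, :, :]) ** 2).sum(axis=2)


def cluster_centroids(data, clusters, k):
    # Mean position of the observations currently assigned to each cluster.
    return da.stack([data[clusters == i].mean(axis=0) for i in range(k)])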
def test_reductions_2D_nans():
    # chunks are a mix of some/all/no NaNs
    x = np.full((4, 4), np.nan)
    x[:2, :2] = np.array([[1, 2], [3, 4]])
    x[2, 2] = 5
    x[3, 3] = 6
    a = da.from_array(x, chunks=(2, 2))

    reduction_2d_test(da.sum, a, np.sum, x, False, False)
    reduction_2d_test(da.prod, a, np.prod, x, False, False)
    reduction_2d_test(da.mean, a, np.mean, x, False, False)
    reduction_2d_test(da.var, a, np.var, x, False, False)
    reduction_2d_test(da.std, a, np.std, x, False, False)
    reduction_2d_test(da.min, a, np.min, x, False, False)
    reduction_2d_test(da.max, a, np.max, x, False, False)
    reduction_2d_test(da.any, a, np.any, x, False, False)
    reduction_2d_test(da.all, a, np.all, x, False, False)

    reduction_2d_test(da.nansum, a, np.nansum, x, False, False)
    reduction_2d_test(da.nanprod, a, np.nanprod, x, False, False)

    with warnings.catch_warnings():
        # Ignore division-by-zero and all-NaN-slice RuntimeWarnings.
        warnings.simplefilter("ignore", RuntimeWarning)
        reduction_2d_test(da.nanmean, a, np.nanmean, x, False, False)
        reduction_2d_test(da.nanvar, a, np.nanvar, x, False, False)
        reduction_2d_test(da.nanstd, a, np.nanstd, x, False, False)
        reduction_2d_test(da.nanmin, a, np.nanmin, x, False, False)
        reduction_2d_test(da.nanmax, a, np.nanmax, x, False, False)

        assert_eq(da.argmax(a), np.argmax(x))
        assert_eq(da.argmin(a), np.argmin(x))
        assert_eq(da.nanargmax(a), np.nanargmax(x))
        assert_eq(da.nanargmin(a), np.nanargmin(x))
        assert_eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
        assert_eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
        assert_eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
        assert_eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
        assert_eq(da.argmax(a, axis=1), np.argmax(x, axis=1))
        assert_eq(da.argmin(a, axis=1), np.argmin(x, axis=1))
        assert_eq(da.nanargmax(a, axis=1), np.nanargmax(x, axis=1))
        assert_eq(da.nanargmin(a, axis=1), np.nanargmin(x, axis=1))
def test_reductions_2D(dtype):
    x = np.arange(1, 122).reshape((11, 11)).astype(dtype)
    a = da.from_array(x, chunks=(4, 4))

    b = a.sum(keepdims=True)
    assert b._keys() == [[(b.name, 0, 0)]]

    reduction_2d_test(da.sum, a, np.sum, x)
    reduction_2d_test(da.prod, a, np.prod, x)
    reduction_2d_test(da.mean, a, np.mean, x)
    reduction_2d_test(da.var, a, np.var, x, False)  # Difference in dtype algo
    reduction_2d_test(da.std, a, np.std, x, False)  # Difference in dtype algo
    reduction_2d_test(da.min, a, np.min, x, False)
    reduction_2d_test(da.max, a, np.max, x, False)
    reduction_2d_test(da.any, a, np.any, x, False)
    reduction_2d_test(da.all, a, np.all, x, False)

    reduction_2d_test(da.nansum, a, np.nansum, x)
    with ignoring(AttributeError):
        reduction_2d_test(da.nanprod, a, np.nanprod, x)
    reduction_2d_test(da.nanmean, a, np.mean, x)
    reduction_2d_test(da.nanvar, a, np.nanvar, x, False)  # Difference in dtype algo
    reduction_2d_test(da.nanstd, a, np.nanstd, x, False)  # Difference in dtype algo
    reduction_2d_test(da.nanmin, a, np.nanmin, x, False)
    reduction_2d_test(da.nanmax, a, np.nanmax, x, False)

    assert eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    assert eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
    assert eq(da.argmax(a, axis=1), np.argmax(x, axis=1))
    assert eq(da.argmin(a, axis=1), np.argmin(x, axis=1))
    assert eq(da.nanargmax(a, axis=1), np.nanargmax(x, axis=1))
    assert eq(da.nanargmin(a, axis=1), np.nanargmin(x, axis=1))

    assert eq(da.argmax(a, axis=0, split_every=2), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0, split_every=2), np.argmin(x, axis=0))
    assert eq(da.nanargmax(a, axis=0, split_every=2), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0, split_every=2), np.nanargmin(x, axis=0))
    assert eq(da.argmax(a, axis=1, split_every=2), np.argmax(x, axis=1))
    assert eq(da.argmin(a, axis=1, split_every=2), np.argmin(x, axis=1))
    assert eq(da.nanargmax(a, axis=1, split_every=2), np.nanargmax(x, axis=1))
    assert eq(da.nanargmin(a, axis=1, split_every=2), np.nanargmin(x, axis=1))
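# `split_every` controls the fan-in of dask's tree reduction: how many
# intermediate chunk results are combined per step. The numerical result is
# identical; only the shape of the task graph changes. A quick illustration:
import numpy as np
import dask.array as da

x = da.from_array(np.arange(121).reshape(11, 11), chunks=(4, 4))
assert int(x.argmax(axis=0, split_every=2)[0].compute()) == 10
assert int(x.argmax(axis=0)[0].compute()) == 10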
def test_reductions_2D_nans():
    # chunks are a mix of some/all/no NaNs
    x = np.full((4, 4), np.nan)
    x[:2, :2] = np.array([[1, 2], [3, 4]])
    x[2, 2] = 5
    x[3, 3] = 6
    a = da.from_array(x, chunks=(2, 2))

    reduction_2d_test(da.sum, a, np.sum, x, False, False)
    reduction_2d_test(da.prod, a, np.prod, x, False, False)
    reduction_2d_test(da.mean, a, np.mean, x, False, False)
    reduction_2d_test(da.var, a, np.var, x, False, False)
    reduction_2d_test(da.std, a, np.std, x, False, False)
    reduction_2d_test(da.min, a, np.min, x, False, False)
    reduction_2d_test(da.max, a, np.max, x, False, False)
    reduction_2d_test(da.any, a, np.any, x, False, False)
    reduction_2d_test(da.all, a, np.all, x, False, False)

    reduction_2d_test(da.nansum, a, np.nansum, x, False, False)
    with ignoring(AttributeError):
        reduction_2d_test(da.nanprod, a, np.nanprod, x, False, False)
    reduction_2d_test(da.nanmean, a, np.nanmean, x, False, False)
    reduction_2d_test(da.nanvar, a, np.nanvar, x, False, False)
    reduction_2d_test(da.nanstd, a, np.nanstd, x, False, False)
    reduction_2d_test(da.nanmin, a, np.nanmin, x, False, False)
    reduction_2d_test(da.nanmax, a, np.nanmax, x, False, False)

    assert eq(da.argmax(a), np.argmax(x))
    assert eq(da.argmin(a), np.argmin(x))
    assert eq(da.nanargmax(a), np.nanargmax(x))
    assert eq(da.nanargmin(a), np.nanargmin(x))
    assert eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    assert eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
    assert eq(da.argmax(a, axis=1), np.argmax(x, axis=1))
    assert eq(da.argmin(a, axis=1), np.argmin(x, axis=1))
    assert eq(da.nanargmax(a, axis=1), np.nanargmax(x, axis=1))
    assert eq(da.nanargmin(a, axis=1), np.nanargmin(x, axis=1))
def nanargmin(a, axis=None):
    fill_value = dtypes.get_pos_infinity(a.dtype)
    if a.dtype.kind == 'O':
        return _nan_argminmax_object('argmin', fill_value, a, axis=axis)
    a, mask = _replace_nan(a, fill_value)
    if isinstance(a, dask_array_type):
        res = dask_array.argmin(a, axis=axis)
    else:
        res = np.argmin(a, axis=axis)
    if mask is not None:
        mask = mask.all(axis=axis)
        if mask.any():
            raise ValueError("All-NaN slice encountered")
    return res
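# `nanargmin` above leans on helpers that are not shown in this snippet
# (`dtypes.get_pos_infinity`, `_nan_argminmax_object`, `_replace_nan`). A
# minimal sketch of `_replace_nan`, assuming it swaps NaNs for a fill value
# and returns the NaN mask (None for dtypes that cannot hold NaN):
import numpy as np


def _replace_nan(a, val):
    if a.dtype.kind not in 'fc':
        # Integer/boolean data cannot contain NaN: nothing to replace.
        return a, None
    mask = np.isnan(a)  # a ufunc, so this also works on dask arrays
    return np.where(mask, val, a), mask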
def nearest_neighbour(test_images, train_images, train_labels, k=1):
    pred = np.zeros(test_images.shape[0])
    for i in range(test_images.shape[0]):
        test_image = test_images[i, :]
        # L1 distance between this test image and every training image.
        nn = da.sum(np.abs(train_images - test_image), axis=1, keepdims=True)
        if k == 1:
            # Index of the single closest training image; compute the dask
            # scalar before using it to index the label array.
            idx = da.argmin(nn, axis=0).compute().item()
            pred[i] = train_labels[idx]
        else:
            # Majority vote among the k closest training images.
            nn = np.array(nn)
            min_idx = np.argsort(nn, 0)[:k]
            labels = np.array([train_labels[j] for j in min_idx])
            labels = np.reshape(labels, [-1])
            pred[i] = Counter(labels).most_common()[0][0]
    return pred
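# Minimal usage sketch for the classifier above. Flattened images in rows
# and integer class labels are assumptions based on how the function
# indexes its inputs:
import numpy as np
import dask.array as da
from collections import Counter

train_images = da.from_array(np.random.rand(100, 64), chunks=(25, 64))
train_labels = np.random.randint(0, 10, size=100)
test_images = np.random.rand(5, 64)
pred = nearest_neighbour(test_images, train_images, train_labels, k=3)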
def test_linspace(endpoint):
    darr = da.linspace(6, 49, endpoint=endpoint, chunks=5)
    nparr = np.linspace(6, 49, endpoint=endpoint)
    assert_eq(darr, nparr)

    darr = da.linspace(1.4, 4.9, endpoint=endpoint, chunks=5, num=13)
    nparr = np.linspace(1.4, 4.9, endpoint=endpoint, num=13)
    assert_eq(darr, nparr)

    darr = da.linspace(6, 49, endpoint=endpoint, chunks=5, dtype=float)
    nparr = np.linspace(6, 49, endpoint=endpoint, dtype=float)
    assert_eq(darr, nparr)

    darr, dstep = da.linspace(6, 49, endpoint=endpoint, chunks=5, retstep=True)
    nparr, npstep = np.linspace(6, 49, endpoint=endpoint, retstep=True)
    assert np.allclose(dstep, npstep)
    assert_eq(darr, nparr)

    darr = da.linspace(1.4, 4.9, endpoint=endpoint, chunks=5, num=13, dtype=int)
    nparr = np.linspace(1.4, 4.9, num=13, endpoint=endpoint, dtype=int)
    assert_eq(darr, nparr)

    # Identical calls must produce identical task graphs (deterministic names).
    assert (sorted(da.linspace(1.4, 4.9, endpoint=endpoint, chunks=5, num=13).dask) ==
            sorted(da.linspace(1.4, 4.9, endpoint=endpoint, chunks=5, num=13).dask))
    assert (sorted(da.linspace(6, 49, endpoint=endpoint, chunks=5, dtype=float).dask) ==
            sorted(da.linspace(6, 49, endpoint=endpoint, chunks=5, dtype=float).dask))

    # Lazy scalars (here argmin/argmax results) are accepted as bounds.
    x = da.array([0.2, 6.4, 3.0, 1.6])
    nparr = np.linspace(0, 2, 8, endpoint=endpoint)
    darr = da.linspace(da.argmin(x), da.argmax(x) + 1, 8, endpoint=endpoint)
    assert_eq(darr, nparr)
def test_reductions():
    x = np.arange(5).astype('f4')
    a = da.from_array(x, chunks=(2,))

    assert eq(da.all(a), np.all(x))
    assert eq(da.any(a), np.any(x))
    assert eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    assert eq(da.max(a), np.max(x))
    assert eq(da.mean(a), np.mean(x))
    assert eq(da.min(a), np.min(x))
    assert eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
    assert eq(da.nanmax(a), np.nanmax(x))
    assert eq(da.nanmin(a), np.nanmin(x))
    assert eq(da.nansum(a), np.nansum(x))
    assert eq(da.nanvar(a), np.nanvar(x))
    assert eq(da.nanstd(a), np.nanstd(x))
def __call__(self, data: Data, centroids: Centroids) -> IntLabels:
    """Find closest centroids

    @param data: observations in rows
    @param centroids: centroids in rows
    @return: vector of labels of centroids closest to points
    """
    if data.shape[1] != centroids.shape[1]:
        msg = ("Dimensionality of data and centroids must be equal. "
               f"Was {data.shape[1]} and {centroids.shape[1]}")
        logging.error(msg)
        raise ValueError(msg)
    if self.allow_dask and (data.shape[0] > 10000 or data.shape[1] > 1000):
        X1 = da.from_array(data)
        X2 = da.from_array(centroids)
        distances = ddst.cdist(X1, X2, self.distance_metric)
        labels = da.argmin(distances, axis=1).compute()
    else:
        distances = dst.cdist(data, centroids, self.distance_metric)
        labels = np.argmin(distances, axis=1)
    return labels
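# The same nearest-centroid assignment in a minimal standalone form, using
# only the small-data branch (assuming `dst` above is scipy.spatial.distance
# and `ddst` a dask-aware cdist such as the dask_distance package):
import numpy as np
from scipy.spatial import distance as dst

data = np.random.rand(200, 3)      # observations in rows
centroids = np.random.rand(5, 3)   # centroids in rows
labels = np.argmin(dst.cdist(data, centroids, 'euclidean'), axis=1)
assert labels.shape == (200,)      # one centroid index per observation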
def _query(self, indexers):
    X = coords_to_point_array([indexers[c] for c in self._index_coords])

    if isinstance(X, np.ndarray) and isinstance(self._index, XoakIndexWrapper):
        # directly call index wrapper's query method
        res = self._index.query(X)
        results = res['indices'][:, 0]
    else:
        # Two-stage lazy query with dask
        import dask
        import dask.array as da

        # coerce query array as a dask array and index(es) as an iterable
        if isinstance(X, np.ndarray):
            X = da.from_array(X, chunks=X.shape)
        if isinstance(self._index, XoakIndexWrapper):
            indexes = [self._index]
        else:
            indexes = self._index

        # 1st "map" stage:
        # - execute `IndexWrapperCls.query` for each query array chunk and
        #   each index instance
        # - concatenate all distances/positions results in two dask arrays
        #   of shape (n_points, n_indexes)
        res_chunk = []
        for i, chunk in enumerate(X.to_delayed().ravel()):
            res_chunk_idx = []
            chunk_npoints = X.chunks[0][i]
            shape = (chunk_npoints, 1)
            for idx in indexes:
                dlyd = dask.delayed(idx.query)(chunk)
                res_chunk_idx.append(
                    da.from_delayed(dlyd, shape,
                                    dtype=XoakIndexWrapper._query_result_dtype)
                )
            res_chunk.append(da.concatenate(res_chunk_idx, axis=1))

        map_results = da.concatenate(res_chunk, axis=0)
        distances = map_results['distances']
        indices = map_results['indices']

        # 2nd "reduce" stage:
        # - brute force lookup over the indexes dimension (columns)
        indices_col = da.argmin(distances, axis=1)
        results = da.blockwise(
            lambda arr, icol: np.take_along_axis(arr, icol[:, None], 1),
            'i',
            indices, 'ik',
            indices_col, 'i',
            dtype=np.intp,
            concatenate=True,
        )

    return results
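# The "reduce" stage above is just: for each query point, take the candidate
# position whose distance is smallest across indexes. The same idiom in
# plain NumPy on toy data (not xoak's API):
import numpy as np

distances = np.array([[0.3, 0.1], [0.2, 0.5]])  # (n_points, n_indexes)
indices = np.array([[7, 4], [9, 2]])            # candidate positions
icol = np.argmin(distances, axis=1)             # winning index per point
best = np.take_along_axis(indices, icol[:, None], axis=1)[:, 0]
assert best.tolist() == [4, 9]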
def coclustering(Z, nclusters_row, nclusters_col, errobj, niters, epsilon,
                 col_clusters_init=None, row_clusters_init=None,
                 run_on_worker=False):
    """
    Run the co-clustering, Dask implementation

    :param Z: m x n data matrix
    :param nclusters_row: number of row clusters
    :param nclusters_col: number of column clusters
    :param errobj: convergence threshold for the objective function
    :param niters: maximum number of iterations
    :param epsilon: numerical parameter, avoids zero arguments in log
    :param row_clusters_init: initial row cluster assignment
    :param col_clusters_init: initial column cluster assignment
    :param run_on_worker: whether the function is submitted to a Dask worker
    :return: has converged, number of iterations performed, final row and
        column clustering, error value
    """
    client = get_client()

    Z = da.array(Z) if not isinstance(Z, da.Array) else Z

    [m, n] = Z.shape
    row_chunks, col_chunks = Z.chunksize

    row_clusters = da.array(row_clusters_init) \
        if row_clusters_init is not None \
        else _initialize_clusters(m, nclusters_row, chunks=row_chunks)
    col_clusters = da.array(col_clusters_init) \
        if col_clusters_init is not None \
        else _initialize_clusters(n, nclusters_col, chunks=col_chunks)
    R = _setup_cluster_matrix(nclusters_row, row_clusters)
    C = _setup_cluster_matrix(nclusters_col, col_clusters)

    e, old_e = 2 * errobj, 0
    s = 0
    converged = False

    Gavg = Z.mean()

    while (not converged) & (s < niters):
        logger.debug(f'Iteration # {s} ..')
        # Calculate cluster based averages
        # nel_clusters is a matrix with the number of elements per co-cluster
        # originally computed as: da.dot(da.dot(R.T, da.ones((m, n))), C)
        nel_row_clusters = da.bincount(row_clusters, minlength=nclusters_row)
        nel_col_clusters = da.bincount(col_clusters, minlength=nclusters_col)
        logger.debug('num of populated clusters: row {}, col {}'.format(
            da.sum(nel_row_clusters > 0).compute(),
            da.sum(nel_col_clusters > 0).compute()))
        nel_clusters = da.outer(nel_row_clusters, nel_col_clusters)
        CoCavg = (da.matmul(da.matmul(R.T, Z), C) + Gavg * epsilon) / \
                 (nel_clusters + epsilon)

        # Calculate distance based on row approximation
        d_row = _distance(Z, da.matmul(C, CoCavg.T), epsilon)
        # Assign to best row cluster
        row_clusters = da.argmin(d_row, axis=1)
        R = _setup_cluster_matrix(nclusters_row, row_clusters)

        # Calculate distance based on column approximation
        d_col = _distance(Z.T, da.matmul(R, CoCavg), epsilon)
        # Assign to best column cluster
        col_clusters = da.argmin(d_col, axis=1)
        C = _setup_cluster_matrix(nclusters_col, col_clusters)

        # Error value (actually just the column components really)
        old_e = e
        minvals = da.min(d_col, axis=1)
        # power 1 divergence, power 2 euclidean
        e = da.sum(da.power(minvals, 1))

        row_clusters, R, col_clusters, C, e = client.persist(
            [row_clusters, R, col_clusters, C, e])

        if run_on_worker:
            # this is a workaround for e.compute() for a function that runs
            # on a worker with multiple threads
            # https://github.com/dask/distributed/issues/3827
            e = client.compute(e)
            secede()
            e = e.result()
            rejoin()
        else:
            e = e.compute()
        logger.debug(f'Error = {e:+.15e}, dE = {e - old_e:+.15e}')
        converged = abs(e - old_e) < errobj
        s = s + 1
    if converged:
        logger.debug(f'Coclustering converged in {s} iterations')
    else:
        logger.debug(f'Coclustering not converged in {s} iterations')
    return converged, s, row_clusters, col_clusters, e
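# The cluster helpers used above (`_initialize_clusters`,
# `_setup_cluster_matrix`, `_distance`) are defined elsewhere in the package.
# Minimal sketches under stated assumptions: random balanced initialization,
# one-hot membership matrices, and an I-divergence style distance (the exact
# objective in the original library may differ):
import numpy as np
import dask.array as da


def _initialize_clusters(n_el, n_clusters, chunks=None):
    # Random initial cluster label for each element, covering every cluster
    # id at least once when n_el >= n_clusters.
    cluster_idx = np.mod(np.arange(n_el), n_clusters)
    np.random.shuffle(cluster_idx)
    return da.from_array(cluster_idx, chunks=chunks)


def _setup_cluster_matrix(n_clusters, cluster_idx):
    # One-hot membership matrix: entry (i, j) is 1 iff element i is
    # currently assigned to cluster j.
    return da.eye(n_clusters, dtype=np.int32)[cluster_idx]


def _distance(Z, Y, epsilon):
    # I-divergence style distance between each row of Z and each cluster
    # profile (column of Y); epsilon keeps the log argument positive.
    Y = Y + epsilon
    return Y.sum(axis=0, keepdims=True) - da.matmul(Z, da.log(Y))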
def triclustering(Z, nclusters_row, nclusters_col, nclusters_bnd, errobj,
                  niters, epsilon, row_clusters_init=None,
                  col_clusters_init=None, bnd_clusters_init=None):
    """
    Run the tri-clustering, Dask implementation

    :param Z: d x m x n data matrix
    :param nclusters_row: number of row clusters
    :param nclusters_col: number of column clusters
    :param nclusters_bnd: number of band clusters
    :param errobj: convergence threshold for the objective function
    :param niters: maximum number of iterations
    :param epsilon: numerical parameter, avoids zero arguments in log
    :param row_clusters_init: initial row cluster assignment
    :param col_clusters_init: initial column cluster assignment
    :param bnd_clusters_init: initial band cluster assignment
    :return: has converged, number of iterations performed, final row,
        column, and band clustering, error value
    """
    client = get_client()

    Z = da.array(Z) if not isinstance(Z, da.Array) else Z

    [d, m, n] = Z.shape
    bnd_chunks, row_chunks, col_chunks = Z.chunksize

    row_clusters = da.array(row_clusters_init) \
        if row_clusters_init is not None \
        else _initialize_clusters(m, nclusters_row, chunks=row_chunks)
    col_clusters = da.array(col_clusters_init) \
        if col_clusters_init is not None \
        else _initialize_clusters(n, nclusters_col, chunks=col_chunks)
    bnd_clusters = da.array(bnd_clusters_init) \
        if bnd_clusters_init is not None \
        else _initialize_clusters(d, nclusters_bnd, chunks=bnd_chunks)
    R = _setup_cluster_matrix(nclusters_row, row_clusters)
    C = _setup_cluster_matrix(nclusters_col, col_clusters)
    B = _setup_cluster_matrix(nclusters_bnd, bnd_clusters)

    e, old_e = 2 * errobj, 0
    s = 0
    converged = False

    Gavg = Z.mean()

    while (not converged) & (s < niters):
        logger.debug(f'Iteration # {s} ..')
        # Calculate number of elements in each tri-cluster
        nel_row_clusters = da.bincount(row_clusters, minlength=nclusters_row)
        nel_col_clusters = da.bincount(col_clusters, minlength=nclusters_col)
        nel_bnd_clusters = da.bincount(bnd_clusters, minlength=nclusters_bnd)
        logger.debug(
            'num of populated clusters: row {}, col {}, bnd {}'.format(
                da.sum(nel_row_clusters > 0).compute(),
                da.sum(nel_col_clusters > 0).compute(),
                da.sum(nel_bnd_clusters > 0).compute()))
        nel_clusters = da.einsum('i,j->ij', nel_row_clusters,
                                 nel_col_clusters)
        nel_clusters = da.einsum('i,jk->ijk', nel_bnd_clusters, nel_clusters)

        # calculate tri-cluster averages (epsilon takes care of empty
        # clusters); first sum values in each tri-cluster ..
        TriCavg = da.einsum('ij,ilm->jlm', B, Z)  # .. along band axis
        TriCavg = da.einsum('ij,kim->kjm', R, TriCavg)  # .. along row axis
        TriCavg = da.einsum('ij,kli->klj', C, TriCavg)  # .. along col axis
        # finally divide by number of elements in each tri-cluster
        TriCavg = (TriCavg + Gavg * epsilon) / (nel_clusters + epsilon)

        # unpack tri-cluster averages ..
        avg_unpck = da.einsum('ij,jkl->ikl', B, TriCavg)  # .. along band axis
        avg_unpck = da.einsum('ij,klj->kli', C, avg_unpck)  # .. along col axis
        # use these for the row cluster assignment
        idx = (1, 0, 2)
        d_row = _distance(Z.transpose(idx), avg_unpck.transpose(idx), epsilon)
        row_clusters = da.argmin(d_row, axis=1)
        R = _setup_cluster_matrix(nclusters_row, row_clusters)

        # unpack tri-cluster averages ..
        avg_unpck = da.einsum('ij,jkl->ikl', B, TriCavg)  # .. along band axis
        avg_unpck = da.einsum('ij,kjl->kil', R, avg_unpck)  # .. along row axis
        # use these for the col cluster assignment
        idx = (2, 0, 1)
        d_col = _distance(Z.transpose(idx), avg_unpck.transpose(idx), epsilon)
        col_clusters = da.argmin(d_col, axis=1)
        C = _setup_cluster_matrix(nclusters_col, col_clusters)

        # unpack tri-cluster averages ..
        avg_unpck = da.einsum('ij,kjl->kil', R, TriCavg)  # .. along row axis
        avg_unpck = da.einsum('ij,klj->kli', C, avg_unpck)  # .. along col axis
        # use these for the band cluster assignment
        d_bnd = _distance(Z, avg_unpck, epsilon)
        bnd_clusters = da.argmin(d_bnd, axis=1)
        B = _setup_cluster_matrix(nclusters_bnd, bnd_clusters)

        # Error value (actually just the band component really)
        old_e = e
        minvals = da.min(d_bnd, axis=1)
        # power 1 divergence, power 2 euclidean
        e = da.sum(da.power(minvals, 1))

        row_clusters, R, col_clusters, C, bnd_clusters, B, e = client.persist(
            [row_clusters, R, col_clusters, C, bnd_clusters, B, e])

        e = e.compute()
        logger.debug(f'Error = {e:+.15e}, dE = {e - old_e:+.15e}')
        converged = abs(e - old_e) < errobj
        s = s + 1
    if converged:
        logger.debug(f'Triclustering converged in {s} iterations')
    else:
        logger.debug(f'Triclustering not converged in {s} iterations')
    return converged, s, row_clusters, col_clusters, bnd_clusters, e
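# Minimal usage sketch for the tri-clustering driver above (toy data; a
# dask.distributed client must be running because the function calls
# get_client()):
import dask.array as da
from dask.distributed import Client

client = Client()  # local cluster
Z = da.random.random((6, 20, 30), chunks=(6, 10, 15))
converged, n_iter, rows, cols, bands, err = triclustering(
    Z, nclusters_row=4, nclusters_col=3, nclusters_bnd=2,
    errobj=1e-6, niters=50, epsilon=1e-8)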