Example #1
def from_bcolz(x, chunksize=None, categorize=True, index=None, **kwargs):
    """ Read a dask DataFrame from a bcolz.ctable

    Parameters
    ----------

    x : bcolz.ctable
        Input data
    chunksize : int (optional)
        The size of blocks to pull out from ctable.  Ideally as large as can
        comfortably fit in memory
    categorize : bool (defaults to True)
        Automatically categorize all string dtypes
    index : string (optional)
        Column to make the index

    See Also
    --------

    from_array: more generic function not optimized for bcolz
    """
    import dask.array as da
    import bcolz
    if isinstance(x, (str, unicode)):
        x = bcolz.ctable(rootdir=x)
    bc_chunklen = max(x[name].chunklen for name in x.names)
    if chunksize is None and bc_chunklen > 10000:
        chunksize = bc_chunklen

    categories = dict()
    if categorize:
        for name in x.names:
            if (np.issubdtype(x.dtype[name], np.string_) or
                    np.issubdtype(x.dtype[name], np.unicode_) or
                    np.issubdtype(x.dtype[name], np.object_)):
                a = da.from_array(x[name], chunks=(chunksize * len(x.names),))
                categories[name] = da.unique(a)

    columns = tuple(x.dtype.names)
    divisions = (0,) + tuple(range(-1, len(x), chunksize))[1:]
    if divisions[-1] != len(x) - 1:
        divisions = divisions + (len(x) - 1,)
    new_name = 'from_bcolz' + next(tokens)
    dsk = dict(((new_name, i),
                (dataframe_from_ctable,
                 x,
                 (slice(i * chunksize, (i + 1) * chunksize),),
                 None, categories))
               for i in range(0, int(ceil(len(x) / chunksize))))

    result = DataFrame(dsk, new_name, columns, divisions)

    if index:
        assert index in x.names
        a = da.from_array(x[index], chunks=(chunksize * len(x.names),))
        q = np.linspace(0, 100, len(x) // chunksize + 2)
        divisions = da.percentile(a, q).compute()
        return set_partition(result, index, divisions, **kwargs)
    else:
        return result
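# A minimal usage sketch (not part of the original source), assuming the bcolz
# package is installed; the column names 'id' and 'value' below are made up for
# illustration only.
import numpy as np
import bcolz

ct = bcolz.ctable(columns=[np.arange(10), np.random.random(10)],
                  names=['id', 'value'])
ddf = from_bcolz(ct, chunksize=5, index='id')
print(ddf.head())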
Example #2
def test_grid_search_dask_inputs():
    # Numpy versions
    np_X, np_y = make_classification(n_samples=15, n_classes=2, random_state=0)
    np_groups = np.random.RandomState(0).randint(0, 3, 15)
    # Dask array versions
    da_X = da.from_array(np_X, chunks=5)
    da_y = da.from_array(np_y, chunks=5)
    da_groups = da.from_array(np_groups, chunks=5)
    # Delayed versions
    del_X = delayed(np_X)
    del_y = delayed(np_y)
    del_groups = delayed(np_groups)

    cv = GroupKFold()
    clf = SVC(random_state=0)
    grid = {'C': [1]}

    sol = SVC(C=1, random_state=0).fit(np_X, np_y).support_vectors_

    for X, y, groups in product([np_X, da_X, del_X],
                                [np_y, da_y, del_y],
                                [np_groups, da_groups, del_groups]):
        gs = dcv.GridSearchCV(clf, grid, cv=cv)

        with pytest.raises(ValueError) as exc:
            gs.fit(X, y)
        assert "The groups parameter should not be None" in str(exc.value)

        gs.fit(X, y, groups=groups)
        np.testing.assert_allclose(sol, gs.best_estimator_.support_vectors_)
Example #3
def test_apply_dask_multiple_inputs():
    import dask.array as da

    def covariance(x, y):
        return ((x - x.mean(axis=-1, keepdims=True)) *
                (y - y.mean(axis=-1, keepdims=True))).mean(axis=-1)

    rs = np.random.RandomState(42)
    array1 = da.from_array(rs.randn(4, 4), chunks=(2, 4))
    array2 = da.from_array(rs.randn(4, 4), chunks=(2, 4))
    data_array_1 = xr.DataArray(array1, dims=('x', 'z'))
    data_array_2 = xr.DataArray(array2, dims=('y', 'z'))

    expected = apply_ufunc(
        covariance, data_array_1.compute(), data_array_2.compute(),
        input_core_dims=[['z'], ['z']])
    allowed = apply_ufunc(
        covariance, data_array_1, data_array_2, input_core_dims=[['z'], ['z']],
        dask='allowed')
    assert isinstance(allowed.data, da.Array)
    xr.testing.assert_allclose(expected, allowed.compute())

    parallelized = apply_ufunc(
        covariance, data_array_1, data_array_2, input_core_dims=[['z'], ['z']],
        dask='parallelized', output_dtypes=[float])
    assert isinstance(parallelized.data, da.Array)
    xr.testing.assert_allclose(expected, parallelized.compute())
Example #4
def test_solve(shape, chunk):
    np.random.seed(1)

    A = np.random.random_integers(1, 10, (shape, shape))
    dA = da.from_array(A, (chunk, chunk))

    # vector
    b = np.random.random_integers(1, 10, shape)
    db = da.from_array(b, chunk)

    res = da.linalg.solve(dA, db)
    assert_eq(res, scipy.linalg.solve(A, b))
    assert_eq(dA.dot(res), b.astype(float))

    # tall-and-skinny matrix
    b = np.random.random_integers(1, 10, (shape, 5))
    db = da.from_array(b, (chunk, 5))

    res = da.linalg.solve(dA, db)
    assert_eq(res, scipy.linalg.solve(A, b))
    assert_eq(dA.dot(res), b.astype(float))

    # matrix
    b = np.random.random_integers(1, 10, (shape, shape))
    db = da.from_array(b, (chunk, chunk))

    res = da.linalg.solve(dA, db)
    assert_eq(res, scipy.linalg.solve(A, b))
    assert_eq(dA.dot(res), b.astype(float))
Example #5
def test_lu_1():
    A1 = np.array([[7, 3, -1, 2], [3, 8, 1, -4],
                  [-1, 1, 4, -1], [2, -4, -1, 6] ])

    A2 = np.array([[7,  0,  0,  0,  0,  0],
                   [0,  8,  0,  0,  0,  0],
                   [0,  0,  4,  0,  0,  0],
                   [0,  0,  0,  6,  0,  0],
                   [0,  0,  0,  0,  3,  0],
                   [0,  0,  0,  0,  0,  5]])
    # without shuffle
    for A, chunk in zip([A1, A2], [2, 2]):
        dA = da.from_array(A, chunks=(chunk, chunk))
        p, l, u = scipy.linalg.lu(A)
        dp, dl, du = da.linalg.lu(dA)
        assert_eq(p, dp)
        assert_eq(l, dl)
        assert_eq(u, du)
        _check_lu_result(dp, dl, du, A)

    A3 = np.array([[ 7,  3,  2,  1,  4,  1],
                   [ 7, 11,  5,  2,  5,  2],
                   [21, 25, 16, 10, 16,  5],
                   [21, 41, 18, 13, 16, 11],
                   [14, 46, 23, 24, 21, 22],
                   [ 0, 56, 29, 17, 14, 8]])

    # with shuffle
    for A, chunk in zip([A3], [2]):
        dA = da.from_array(A, chunks=(chunk, chunk))
        p, l, u = scipy.linalg.lu(A)
        dp, dl, du = da.linalg.lu(dA)
        _check_lu_result(dp, dl, du, A)
Example #6
def test_tsqr_zero_height_chunks():
    m_q = 10
    n_q = 5
    m_r = 5
    n_r = 5

    # certainty
    mat = np.random.rand(10, 5)
    x = da.from_array(mat, chunks=((4, 0, 1, 0, 5), (5,)))
    q, r = da.linalg.qr(x)
    assert_eq((m_q, n_q), q.shape)  # shape check
    assert_eq((m_r, n_r), r.shape)  # shape check
    assert_eq(mat, da.dot(q, r))  # accuracy check
    assert_eq(np.eye(n_q, n_q), da.dot(q.T, q))  # q must be orthonormal
    assert_eq(r, da.triu(r.rechunk(r.shape[0])))  # r must be upper triangular

    # uncertainty
    mat2 = np.vstack([mat, -np.ones((10, 5))])
    v2 = mat2[:, 0]
    x2 = da.from_array(mat2, chunks=5)
    c = da.from_array(v2, chunks=5)
    x = x2[c >= 0, :]  # remove the ones added above to yield mat
    q, r = da.linalg.qr(x)
    q = q.compute()  # because uncertainty
    r = r.compute()
    assert_eq((m_q, n_q), q.shape)  # shape check
    assert_eq((m_r, n_r), r.shape)  # shape check
    assert_eq(mat, np.dot(q, r))  # accuracy check
    assert_eq(np.eye(n_q, n_q), np.dot(q.T, q))  # q must be orthonormal
    assert_eq(r, np.triu(r))  # r must be upper triangular
Example #7
def get_reflectance_lut(filename):
    """Read the LUT with reflectances as a function of wavelength, satellite
    zenith secant, azimuth difference angle, and sun zenith secant.

    """

    h5f = h5py.File(filename, 'r')

    tab = h5f['reflectance']
    wvl = h5f['wavelengths']
    azidiff = h5f['azimuth_difference']
    satellite_zenith_secant = h5f['satellite_zenith_secant']
    sun_zenith_secant = h5f['sun_zenith_secant']

    if HAVE_DASK:
        tab = from_array(tab, chunks=(10, 10, 10, 10))
        wvl = wvl[:]  # no benefit to dask-ifying this
        azidiff = from_array(azidiff, chunks=(1000,))
        satellite_zenith_secant = from_array(satellite_zenith_secant,
                                             chunks=(1000,))
        sun_zenith_secant = from_array(sun_zenith_secant,
                                       chunks=(1000,))
    else:
        # load all of the data we are going to use in to memory
        tab = tab[:]
        wvl = wvl[:]
        azidiff = azidiff[:]
        satellite_zenith_secant = satellite_zenith_secant[:]
        sun_zenith_secant = sun_zenith_secant[:]
        h5f.close()

    return tab, wvl, azidiff, satellite_zenith_secant, sun_zenith_secant
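# A minimal usage sketch (not part of the original source). The file name below
# is hypothetical; the function returns the reflectance table and its coordinate
# axes, as dask arrays when dask is available and as in-memory numpy arrays
# otherwise.
tab, wvl, azidiff, satz_sec, sunz_sec = get_reflectance_lut(
    'rayleigh_lut_marine_clean_aerosol.h5')
print(tab.shape, wvl.shape)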
Example #8
def test_solve_sym_pos(shape, chunk):
    np.random.seed(1)

    A = _get_symmat(shape)
    dA = da.from_array(A, (chunk, chunk))

    # vector
    b = np.random.randint(1, 10, shape)
    db = da.from_array(b, chunk)

    res = da.linalg.solve(dA, db, sym_pos=True)
    assert_eq(res, scipy.linalg.solve(A, b, sym_pos=True))
    assert_eq(dA.dot(res), b.astype(float))

    # tall-and-skinny matrix
    b = np.random.randint(1, 10, (shape, 5))
    db = da.from_array(b, (chunk, 5))

    res = da.linalg.solve(dA, db, sym_pos=True)
    assert_eq(res, scipy.linalg.solve(A, b, sym_pos=True))
    assert_eq(dA.dot(res), b.astype(float))

    # matrix
    b = np.random.randint(1, 10, (shape, shape))
    db = da.from_array(b, (chunk, chunk))

    res = da.linalg.solve(dA, db, sym_pos=True)
    assert_eq(res, scipy.linalg.solve(A, b, sym_pos=True))
    assert_eq(dA.dot(res), b.astype(float))
Example #9
def test_insert():
    x = np.random.randint(10, size=(10, 10))
    a = da.from_array(x, chunks=(5, 5))
    y = np.random.randint(10, size=(5, 10))
    b = da.from_array(y, chunks=(4, 4))

    assert_eq(np.insert(x, 0, -1, axis=0), da.insert(a, 0, -1, axis=0))
    assert_eq(np.insert(x, 3, -1, axis=-1), da.insert(a, 3, -1, axis=-1))
    assert_eq(np.insert(x, 5, -1, axis=1), da.insert(a, 5, -1, axis=1))
    assert_eq(np.insert(x, -1, -1, axis=-2), da.insert(a, -1, -1, axis=-2))
    assert_eq(np.insert(x, [2, 3, 3], -1, axis=1),
              da.insert(a, [2, 3, 3], -1, axis=1))
    assert_eq(np.insert(x, [2, 3, 8, 8, -2, -2], -1, axis=0),
              da.insert(a, [2, 3, 8, 8, -2, -2], -1, axis=0))
    assert_eq(np.insert(x, slice(1, 4), -1, axis=1),
              da.insert(a, slice(1, 4), -1, axis=1))
    assert_eq(np.insert(x, [2] * 3 + [5] * 2, y, axis=0),
              da.insert(a, [2] * 3 + [5] * 2, b, axis=0))
    assert_eq(np.insert(x, 0, y[0], axis=1),
              da.insert(a, 0, b[0], axis=1))

    assert same_keys(da.insert(a, [2, 3, 8, 8, -2, -2], -1, axis=0),
                     da.insert(a, [2, 3, 8, 8, -2, -2], -1, axis=0))

    with pytest.raises(NotImplementedError):
        da.insert(a, [4, 2], -1, axis=0)

    with pytest.raises(IndexError):
        da.insert(a, [3], -1, axis=2)

    with pytest.raises(IndexError):
        da.insert(a, [3], -1, axis=-3)
Example #10
def test_apply_gufunc_elemwise_01():
    def add(x, y):
        return x + y
    a = da.from_array(np.array([1, 2, 3]), chunks=2, name='a')
    b = da.from_array(np.array([1, 2, 3]), chunks=2, name='b')
    z = apply_gufunc(add, "(),()->()", a, b, output_dtypes=a.dtype)
    assert_eq(z, np.array([2, 4, 6]))
Example #11
def test_isclose():
    x = np.array([0, np.nan, 1, 1.5])
    y = np.array([1e-9, np.nan, 1, 2])
    a = da.from_array(x, chunks=(2,))
    b = da.from_array(y, chunks=(2,))
    assert_eq(da.isclose(a, b, equal_nan=True),
              np.isclose(x, y, equal_nan=True))
Example #12
def test_dot_method():
    x = np.arange(400).reshape((20, 20))
    a = da.from_array(x, chunks=(5, 5))
    y = np.arange(200).reshape((20, 10))
    b = da.from_array(y, chunks=(5, 5))

    assert_eq(a.dot(b), x.dot(y))
Example #13
def test_tree_reduce_depth():
    # 2D
    x = da.from_array(np.arange(242).reshape((11, 22)), chunks=(3, 4))
    thresh = {0: 2, 1: 3}
    assert_max_deps(x.sum(split_every=thresh), 2 * 3)
    assert_max_deps(x.sum(axis=0, split_every=thresh), 2)
    assert_max_deps(x.sum(axis=1, split_every=thresh), 3)
    assert_max_deps(x.sum(split_every=20), 20, False)
    assert_max_deps(x.sum(axis=0, split_every=20), 4)
    assert_max_deps(x.sum(axis=1, split_every=20), 6)

    # 3D
    x = da.from_array(np.arange(11 * 22 * 29).reshape((11, 22, 29)), chunks=(3, 4, 5))
    thresh = {0: 2, 1: 3, 2: 4}
    assert_max_deps(x.sum(split_every=thresh), 2 * 3 * 4)
    assert_max_deps(x.sum(axis=0, split_every=thresh), 2)
    assert_max_deps(x.sum(axis=1, split_every=thresh), 3)
    assert_max_deps(x.sum(axis=2, split_every=thresh), 4)
    assert_max_deps(x.sum(axis=(0, 1), split_every=thresh), 2 * 3)
    assert_max_deps(x.sum(axis=(0, 2), split_every=thresh), 2 * 4)
    assert_max_deps(x.sum(axis=(1, 2), split_every=thresh), 3 * 4)
    assert_max_deps(x.sum(split_every=20), 20, False)
    assert_max_deps(x.sum(axis=0, split_every=20), 4)
    assert_max_deps(x.sum(axis=1, split_every=20), 6)
    assert_max_deps(x.sum(axis=2, split_every=20), 6)
    assert_max_deps(x.sum(axis=(0, 1), split_every=20), 20, False)
    assert_max_deps(x.sum(axis=(0, 2), split_every=20), 20, False)
    assert_max_deps(x.sum(axis=(1, 2), split_every=20), 20, False)
    assert_max_deps(x.sum(axis=(0, 1), split_every=40), 4 * 6)
    assert_max_deps(x.sum(axis=(0, 2), split_every=40), 4 * 6)
    assert_max_deps(x.sum(axis=(1, 2), split_every=40), 6 * 6)
Example #14
    def get_dataset(self, key, info):

        if self.reader is None:

            with open(self.filename) as fdes:
                data = fdes.read(3)
            if data in ["CMS", "NSS", "UKM", "DSS"]:
                reader = GACKLMReader
                self.chn_dict = AVHRR3_CHANNEL_NAMES
            else:
                reader = GACPODReader
                self.chn_dict = AVHRR_CHANNEL_NAMES

            self.reader = reader()
            self.reader.read(self.filename)

        if key.name in ['latitude', 'longitude']:
            if self.reader.lons is None or self.reader.lats is None:
                #self.reader.get_lonlat(clock_drift_adjust=False)
                self.reader.get_lonlat()
            if key.name == 'latitude':
                return xr.DataArray(da.from_array(self.reader.lats, chunks=1000),
                                    dims=['y', 'x'], attrs=info)
            else:
                return xr.DataArray(da.from_array(self.reader.lons, chunks=1000),
                                    dims=['y', 'x'], attrs=info)

        if self.channels is None:
            self.channels = self.reader.get_calibrated_channels()

        data = self.channels[:, :, self.chn_dict[key.name]]
        return xr.DataArray(da.from_array(data, chunks=1000),
                            dims=['y', 'x'], attrs=info)
Example #15
def test_complex(ufunc):

    dafunc = getattr(da, ufunc)
    npfunc = getattr(np, ufunc)

    real = np.random.randint(1, 100, size=(20, 20))
    imag = np.random.randint(1, 100, size=(20, 20)) * 1j
    comp = real + imag

    dareal = da.from_array(real, 3)
    daimag = da.from_array(imag, 3)
    dacomp = da.from_array(comp, 3)

    assert_eq(dacomp.real, comp.real)
    assert_eq(dacomp.imag, comp.imag)
    assert_eq(dacomp.conj(), comp.conj())

    for darr, arr in [(dacomp, comp), (dareal, real), (daimag, imag)]:
        # applying Dask ufunc doesn't trigger computation
        assert isinstance(dafunc(darr), da.Array)
        assert_eq(dafunc(darr), npfunc(arr))

        assert_eq(npfunc(darr), npfunc(arr))

        # applying Dask ufunc to normal ndarray triggers computation
        assert isinstance(dafunc(arr), np.ndarray)
        assert_eq(dafunc(arr), npfunc(arr))
Example #16
def coords_all_dtypes_and_lazynesses(self, coord_class):
    # Generate coords with all possible types of points and bounds, and all
    # of the given dtypes.
    points_types = ['real', 'lazy']
    bounds_types = ['no', 'real', 'lazy']
    # Test a few specific combinations of points+bounds dtypes, including
    # cases where they are different.
    dtype_pairs = [(np.float64, np.float64),
                   (np.int16, np.int16),
                   (np.int16, np.float32),
                   (np.float64, np.int32)]
    for pts_dtype, bds_dtype in dtype_pairs:
        for points_type_name in points_types:
            for bounds_type_name in bounds_types:
                pts = np.asarray(self.pts_real, dtype=pts_dtype)
                bds = np.asarray(self.bds_real, dtype=bds_dtype)
                if points_type_name == 'lazy':
                    pts = da.from_array(pts, pts.shape)
                if bounds_type_name == 'lazy':
                    bds = da.from_array(bds, bds.shape)
                elif bounds_type_name == 'no':
                    bds = None
                coord = coord_class(pts, bounds=bds)
                result = (coord, points_type_name, bounds_type_name)
                yield result
Example #17
def test_index_with_int_dask_array_0d(chunks):
    # Slice by 0-dimensional array
    x = da.from_array([[10, 20, 30],
                       [40, 50, 60]], chunks=chunks)
    idx0 = da.from_array(1, chunks=1)
    assert_eq(x[idx0, :], x[1, :])
    assert_eq(x[:, idx0], x[:, 1])
Example #18
def test_apply_gufunc_elemwise_01b():
    def add(x, y):
        return x + y
    a = da.from_array(np.array([1, 2, 3]), chunks=2, name='a')
    b = da.from_array(np.array([1, 2, 3]), chunks=1, name='b')
    with pytest.raises(ValueError):
        apply_gufunc(add, "(),()->()", a, b, output_dtypes=a.dtype)
Example #19
def test_lstsq(nrow, ncol, chunk):
    np.random.seed(1)
    A = np.random.randint(1, 20, (nrow, ncol))
    b = np.random.randint(1, 20, nrow)

    dA = da.from_array(A, (chunk, ncol))
    db = da.from_array(b, chunk)

    x, r, rank, s = np.linalg.lstsq(A, b)
    dx, dr, drank, ds = da.linalg.lstsq(dA, db)

    assert_eq(dx, x)
    assert_eq(dr, r)
    assert drank.compute() == rank
    assert_eq(ds, s)

    # reduce rank causes multicollinearity, only compare rank
    A[:, 1] = A[:, 2]
    dA = da.from_array(A, (chunk, ncol))
    db = da.from_array(b, chunk)
    x, r, rank, s = np.linalg.lstsq(A, b,
                                    rcond=np.finfo(np.double).eps * max(nrow,
                                                                        ncol))
    assert rank == ncol - 1
    dx, dr, drank, ds = da.linalg.lstsq(dA, db)
    assert drank.compute() == rank
Example #20
def test_lstsq(nrow, ncol, chunk):
    import scipy.linalg
    np.random.seed(1)
    A = np.random.random_integers(1, 20, (nrow, ncol))
    b = np.random.random_integers(1, 20, nrow)

    dA = da.from_array(A, (chunk, ncol))
    db = da.from_array(b, chunk)

    x, r, rank, s = np.linalg.lstsq(A, b)
    dx, dr, drank, ds = da.linalg.lstsq(dA, db)

    assert_eq(dx, x)
    assert_eq(dr, r)
    assert drank.compute() == rank
    assert_eq(ds, s)

    # reduce rank causes multicollinearity, only compare rank
    A[:, 1] = A[:, 2]
    dA = da.from_array(A, (chunk, ncol))
    db = da.from_array(b, chunk)
    x, r, rank, s = np.linalg.lstsq(A, b)
    assert rank == ncol - 1
    dx, dr, drank, ds = da.linalg.lstsq(dA, db)
    assert drank.compute() == rank
Example #21
def test_index_with_int_dask_array_indexerror(chunks):
    a = da.arange(4, chunks=chunks)
    idx = da.from_array([4], chunks=1)
    with pytest.raises(IndexError):
        a[idx].compute()
    idx = da.from_array([-5], chunks=1)
    with pytest.raises(IndexError):
        a[idx].compute()
Example #22
def test_elemwise_consistent_names():
    a = da.from_array(np.arange(5, dtype='f4'), chunks=(2,))
    b = da.from_array(np.arange(5, dtype='f4'), chunks=(2,))
    assert same_keys(a + b, a + b)
    assert same_keys(a + 2, a + 2)
    assert same_keys(da.exp(a), da.exp(a))
    assert same_keys(da.exp(a, dtype='f8'), da.exp(a, dtype='f8'))
    assert same_keys(da.maximum(a, b), da.maximum(a, b))
Example #23
def test_tril_triu_errors():
    A = np.random.randint(0, 11, (10, 10, 10))
    dA = da.from_array(A, chunks=(5, 5, 5))
    pytest.raises(ValueError, lambda: da.triu(dA))

    A = np.random.randint(0, 11, (30, 35))
    dA = da.from_array(A, chunks=(5, 5))
    pytest.raises(NotImplementedError, lambda: da.triu(dA))
Example #24
def test_index_with_dask_array():
    x = np.arange(36).reshape((6, 6))
    d = da.from_array(x, chunks=(3, 3))
    ind = np.asarray([True, True, False, True, False, False], dtype=bool)
    ind = da.from_array(ind, chunks=2)
    for index in [ind, (slice(1, 9, 2), ind), (ind, slice(2, 8, 1))]:
        x_index = dask.compute(index)[0]
        assert_eq(x[x_index], d[index])
Example #25
def test_bincount_with_weights():
    x = np.array([2, 1, 5, 2, 1])
    d = da.from_array(x, chunks=2)
    weights = np.array([1, 2, 1, 0.5, 1])

    dweights = da.from_array(weights, chunks=2)
    assert eq(da.bincount(d, weights=dweights, minlength=6),
              np.bincount(x, weights=dweights, minlength=6))
Example #26
def test_apply_gufunc_elemwise_02():
    def addmul(x, y):
        assert x.shape in ((2,), (1,))
        return x + y, x * y
    a = da.from_array(np.array([1, 2, 3]), chunks=2, name='a')
    b = da.from_array(np.array([1, 2, 3]), chunks=2, name='b')
    z1, z2 = apply_gufunc(addmul, "(),()->(),()", a, b, output_dtypes=2 * (a.dtype,))
    assert_eq(z1, np.array([2, 4, 6]))
    assert_eq(z2, np.array([1, 4, 9]))
Example #27
def test_bincount_with_weights():
    x = np.array([2, 1, 5, 2, 1])
    d = da.from_array(x, chunks=2)
    weights = np.array([1, 2, 1, 0.5, 1])

    dweights = da.from_array(weights, chunks=2)
    e = da.bincount(d, weights=dweights, minlength=6)
    assert_eq(e, np.bincount(x, weights=dweights.compute(), minlength=6))
    assert same_keys(da.bincount(d, weights=dweights, minlength=6), e)
Example #28
    def test_where_dispatching(self):
        a = np.arange(10)
        b = a > 3
        x = da.from_array(a, 5)
        y = da.from_array(b, 5)
        expected = DataArray(a).where(b)
        self.assertLazyAndIdentical(expected, DataArray(a).where(y))
        self.assertLazyAndIdentical(expected, DataArray(x).where(b))
        self.assertLazyAndIdentical(expected, DataArray(x).where(y))
Example #29
    def get_dataset(self, key, info, out=None, xslice=None, yslice=None):
        """Get the dataset designated by *key*."""
        if key.name in ['solar_zenith_angle', 'solar_azimuth_angle',
                        'satellite_zenith_angle', 'satellite_azimuth_angle']:

            if key.name == 'solar_zenith_angle':
                var = self.sd.select('SolarZenith')
            if key.name == 'solar_azimuth_angle':
                var = self.sd.select('SolarAzimuth')
            if key.name == 'satellite_zenith_angle':
                var = self.sd.select('SensorZenith')
            if key.name == 'satellite_azimuth_angle':
                var = self.sd.select('SensorAzimuth')

            data = xr.DataArray(from_sds(var, chunks=CHUNK_SIZE),
                                dims=['y', 'x']).astype(np.float32)
            data = data.where(data != var._FillValue)
            data = data * np.float32(var.scale_factor)

            data.attrs = info
            return data

        if key.name not in ['longitude', 'latitude']:
            return

        if (self.cache[key.resolution]['lons'] is None or
                self.cache[key.resolution]['lats'] is None):

            lons_id = DatasetID('longitude',
                                resolution=key.resolution)
            lats_id = DatasetID('latitude',
                                resolution=key.resolution)

            lons, lats = self.load(
                [lons_id, lats_id], interpolate=False, raw=True)
            if key.resolution != self.resolution:
                from geotiepoints.geointerpolator import GeoInterpolator
                lons, lats = self._interpolate([lons, lats],
                                               self.resolution,
                                               lons_id.resolution,
                                               GeoInterpolator)
                lons = np.ma.masked_invalid(np.ascontiguousarray(lons))
                lats = np.ma.masked_invalid(np.ascontiguousarray(lats))
            self.cache[key.resolution]['lons'] = lons
            self.cache[key.resolution]['lats'] = lats

        if key.name == 'latitude':
            data = self.cache[key.resolution]['lats'].filled(np.nan)
            data = xr.DataArray(da.from_array(data, chunks=(CHUNK_SIZE, CHUNK_SIZE)),
                                dims=['y', 'x'])
        else:
            data = self.cache[key.resolution]['lons'].filled(np.nan)
            data = xr.DataArray(da.from_array(data, chunks=(CHUNK_SIZE,
                                                            CHUNK_SIZE)),
                                dims=['y', 'x'])
        data.attrs = info
        return data
Example #30
def test_choice():
    np_dtype = np.random.choice(1, size=()).dtype
    size = (10, 3)
    chunks = 4
    x = da.random.choice(3, size=size, chunks=chunks)
    assert x.dtype == np_dtype
    assert x.shape == size
    res = x.compute()
    assert res.dtype == np_dtype
    assert res.shape == size

    np_a = np.array([1, 3, 5, 7, 9], dtype='f8')
    da_a = da.from_array(np_a, chunks=2)

    for a in [np_a, da_a]:
        x = da.random.choice(a, size=size, chunks=chunks)
        res = x.compute()
        assert x.dtype == np_a.dtype
        assert res.dtype == np_a.dtype
        assert set(np.unique(res)).issubset(np_a)

    np_p = np.array([0, 0.2, 0.2, 0.3, 0.3])
    da_p = da.from_array(np_p, chunks=2)

    for a, p in [(da_a, np_p), (np_a, da_p)]:
        x = da.random.choice(a, size=size, chunks=chunks, p=p)
        res = x.compute()
        assert x.dtype == np_a.dtype
        assert res.dtype == np_a.dtype
        assert set(np.unique(res)).issubset(np_a[1:])

    np_dtype = np.random.choice(1, size=(), p=np.array([1])).dtype
    x = da.random.choice(5, size=size, chunks=chunks, p=np_p)
    res = x.compute()
    assert x.dtype == np_dtype
    assert res.dtype == np_dtype

    errs = [(-1, None),             # negative a
            (np_a[:, None], None),  # a must be 1D
            (np_a, np_p[:, None]),  # p must be 1D
            (np_a, np_p[:-2]),      # a and p must match
            (3, np_p),              # a and p must match
            (4, [0.2, 0.2, 0.3])]   # p must sum to 1

    for (a, p) in errs:
        with pytest.raises(ValueError):
            da.random.choice(a, size=size, chunks=chunks, p=p)

    with pytest.raises(NotImplementedError):
        da.random.choice(da_a, size=size, chunks=chunks, replace=False)

    # Want to make sure replace=False works for a single-partition output array
    x = da.random.choice(da_a, size=da_a.shape[0], chunks=-1, replace=False)
    res = x.compute()
    assert len(res) == len(np.unique(res))
Example #31
def test_tile_array_reps(shape, chunks, reps):
    x = np.random.random(shape)
    d = da.from_array(x, chunks=chunks)

    with pytest.raises(NotImplementedError):
        da.tile(d, reps)
Example #32
and computations of a source dask array and display the results in napari.
When the computation takes one or more parameters, one can tie a UI to
them using magicgui.
"""

import numpy as np
import napari
import dask.array as da
from dask.array.lib.stride_tricks import sliding_window_view
from skimage import data

##############################################################################
# Part 1: using code to view a specific value.

blobs = data.binary_blobs(length=64, n_dim=3)
blobs_dask = da.from_array(blobs, chunks=(1, 64, 64))

# original shape [60, 1, 1, 5, 64, 64],
# use squeeze to remove singleton axes
blobs_dask_windows = np.squeeze(
    sliding_window_view(blobs_dask, window_shape=(5, 64, 64)),
    axis=(1, 2),
)
blobs_sum = np.sum(blobs_dask_windows, axis=1)
viewer = napari.view_image(blobs_sum)

if __name__ == '__main__':
    napari.run()

##############################################################################
# Part 2: using magicgui to vary the slice thickness.
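# The original Part 2 code is not included in this excerpt; the following is a
# sketch of what it could look like, assuming magicgui is installed. The
# parameter name 'width' and the slider bounds are illustrative choices.

from magicgui import magicgui
from napari.types import ImageData


@magicgui(auto_call=True, width={"widget_type": "Slider", "min": 1, "max": 20})
def rolling_sum(width: int = 5) -> ImageData:
    # recompute the lazy sliding-window sum whenever the slider value changes
    windows = np.squeeze(
        sliding_window_view(blobs_dask, window_shape=(width, 64, 64)),
        axis=(1, 2),
    )
    return np.sum(windows, axis=1)


# docking the widget lets napari display each returned array as an image layer
viewer.window.add_dock_widget(rolling_sum)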
Example #33
def compute_capacity_factors(tech_points_dict: Dict[str, List[Tuple[float, float]]],
                             spatial_res: float, timestamps: pd.DatetimeIndex,
                             precision: int = 3,
                             smooth_wind_power_curve: bool = True) -> pd.DataFrame:
    """
    Compute capacity factors for a list of points associated to a list of technologies.

    Parameters
    ----------
    tech_points_dict : Dict[str, List[Tuple[float, float]]]
        Dictionary associating to each tech a list of points.
    spatial_res: float
        Spatial resolution of coordinates
    timestamps: pd.DatetimeIndex
        Time stamps for which we want capacity factors
    precision: int (default: 3)
        Number of decimals to which capacity factors are rounded
    smooth_wind_power_curve : boolean (default True)
        If True, the transfer function of wind assets replicates that of a wind farm,
        rather than that of a single wind turbine.

    Returns
    -------
    cap_factor_df : pd.DataFrame
         DataFrame storing capacity factors for each technology and each point

    """

    for tech, points in tech_points_dict.items():
        assert len(points) != 0, f"Error: No points were defined for tech {tech}"

    assert len(timestamps) != 0, "Error: No timestamps were defined."

    # Get the converters corresponding to the input technologies
    # Dictionary indicating for each technology which converter(s) to use.
    #    For each technology in the dictionary:
    #        - if it is pv-based, the name of the converter must be specified as a string
    #        - if it is wind-based, a dictionary must map each of the four wind regimes
    #          defined below (I, II, III, IV) to the name of a converter (as a string)
    converters_dict = get_config_dict(list(tech_points_dict.keys()), ["converter"])

    vres_profiles_dir = f"{data_path}generation/vres/profiles/source/"
    transfer_function_dir = f"{vres_profiles_dir}transfer_functions/"
    data_converter_wind = pd.read_csv(f"{transfer_function_dir}data_wind_turbines.csv", sep=';', index_col=0)
    data_converter_pv = pd.read_csv(f"{transfer_function_dir}data_pv_modules.csv", sep=';', index_col=0)

    dataset = read_resource_database(spatial_res).sel(time=timestamps)

    # Create output dataframe with MultiIndex (tech, coords)
    tech_points_tuples = sorted([(tech, point[0], point[1]) for tech, points in tech_points_dict.items()
                                 for point in points])
    cap_factor_df = pd.DataFrame(index=timestamps,
                                 columns=pd.MultiIndex.from_tuples(tech_points_tuples,
                                                                   names=['technologies', 'lon', 'lat']),
                                 dtype=float)

    for tech in tech_points_dict.keys():

        resource = get_config_values(tech, ["plant"])
        # Round points at the given resolution
        non_rounded_points = tech_points_dict[tech]
        rounded_points = [(round(point[0] / spatial_res) * spatial_res,
                           round(point[1] / spatial_res) * spatial_res)
                          for point in non_rounded_points]
        non_rounded_to_rounded_dict = dict(zip(non_rounded_points, rounded_points))
        sub_dataset = dataset.sel(locations=sorted(list(set(rounded_points))))

        if resource == 'Wind':

            wind_speed_reference_height = 100.
            roughness = sub_dataset.fsr

            # Compute wind speed for all the coordinates
            wind = xu.sqrt(sub_dataset.u100 ** 2 + sub_dataset.v100 ** 2)

            wind_mean = wind.mean(dim='time')

            # Split according to the IEC 61400 WTG classes
            wind_classes = {'IV': [0., 6.5], 'III': [6.5, 8.], 'II': [8., 9.5], 'I': [9.5, 99.]}
            list_df_per_wind_class = []

            for cls in wind_classes:

                filtered_wind_data = wind_mean.where((wind_mean.data >= wind_classes[cls][0]) &
                                                     (wind_mean.data < wind_classes[cls][1]), 0)
                coords_classes = filtered_wind_data[da.nonzero(filtered_wind_data)].locations.values.tolist()

                if len(coords_classes) > 0:

                    wind_filtered = wind.sel(locations=coords_classes)
                    roughness_filtered = roughness.sel(locations=coords_classes)

                    # Get the transfer function curve
                    # literal_eval converts a string to an array (in this case)
                    converter = converters_dict[tech]["converter"][cls]
                    power_curve_array = literal_eval(data_converter_wind.loc['Power curve', converter])
                    wind_speed_references = np.asarray([i[0] for i in power_curve_array])
                    capacity_factor_references = np.asarray([i[1] for i in power_curve_array])
                    capacity_factor_references_pu = capacity_factor_references / max(capacity_factor_references)

                    wind_log = windpowerlib.wind_speed.logarithmic_profile(
                        wind_filtered.values, wind_speed_reference_height,
                        float(data_converter_wind.loc['Hub height [m]', converter]),
                        roughness_filtered.values)
                    wind_data = da.from_array(wind_log, chunks='auto', asarray=True)

                    # The transfer function of wind assets replicates the one of a
                    # wind farm rather than one of a wind turbine.
                    if smooth_wind_power_curve:

                        turbulence_intensity = wind_filtered.std(dim='time') / wind_filtered.mean(dim='time')

                        capacity_factor_farm = windpowerlib.power_curves.smooth_power_curve(
                            pd.Series(wind_speed_references), pd.Series(capacity_factor_references_pu),
                            standard_deviation_method='turbulence_intensity',
                            turbulence_intensity=float(turbulence_intensity.min().values),
                            wind_speed_range=10.0)

                        power_output = da.map_blocks(np.interp, wind_data,
                                                     capacity_factor_farm['wind_speed'].values,
                                                     capacity_factor_farm['value'].values).compute()
                    else:

                        power_output = da.map_blocks(np.interp, wind_data,
                                                     wind_speed_references,
                                                     capacity_factor_references_pu).compute()

                    # Convert rounded point back into non-rounded points
                    power_output_df = pd.DataFrame(power_output, columns=coords_classes)
                    coords_classes_rounded = [non_rounded_to_rounded_dict[point] for point in non_rounded_points]
                    power_output_corrected = [power_output_df[point].values
                                              for point in coords_classes_rounded
                                              if point in power_output_df.columns]
                    coords_classes_non_rounded = [point for point in non_rounded_to_rounded_dict
                                                  if non_rounded_to_rounded_dict[point] in power_output_df.columns]
                    tech_points_tuples = [(lon, lat) for lon, lat in coords_classes_non_rounded]
                    df_per_wind_class = pd.DataFrame(np.array(power_output_corrected).T,
                                                     index=timestamps, columns=tech_points_tuples)
                    list_df_per_wind_class.append(df_per_wind_class)

                else:

                    continue

            cap_factor_df_concat = pd.concat(list_df_per_wind_class, axis=1)
            cap_factor_df[tech] = cap_factor_df_concat.reindex(sorted(cap_factor_df_concat.columns), axis=1)

        elif resource == 'PV':

            converter = converters_dict[tech]["converter"]

            # Get irradiance in W from J
            irradiance = sub_dataset.ssrd / 3600.
            # Get temperature in C from K
            temperature = sub_dataset.t2m - 273.15

            # Homer equation here:
            # https://www.homerenergy.com/products/pro/docs/latest/how_homer_calculates_the_pv_array_power_output.html
            # https://enphase.com/sites/default/files/Enphase_PVWatts_Derate_Guide_ModSolar_06-2014.pdf
            power_output = (float(data_converter_pv.loc['f', converter]) *
                            (irradiance/float(data_converter_pv.loc['G_ref', converter])) *
                            (1. + float(data_converter_pv.loc['k_P [%/C]', converter])/100. *
                             (temperature - float(data_converter_pv.loc['t_ref', converter]))))

            power_output = np.array(power_output)

            # Convert rounded points back into non-rounded points
            power_output_df = pd.DataFrame(power_output, columns=sub_dataset.locations.values.tolist())
            coords_classes_rounded = [non_rounded_to_rounded_dict[point] for point in non_rounded_points]
            power_output_corrected = [power_output_df[point].values
                                      for point in coords_classes_rounded if point in power_output_df.columns]
            cap_factor_df[tech] = np.array(power_output_corrected).T

        else:
            raise ValueError('Profiles for the specified resource are not available yet.')

    # Check that we do not have NANs
    assert cap_factor_df.isna().to_numpy().sum() == 0, "Some capacity factors are not available."

    # Decrease precision of capacity factors
    cap_factor_df = cap_factor_df.round(precision)

    return cap_factor_df
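
# A minimal usage sketch (not part of the original source): the technology
# name, coordinates, resolution and dates below are illustrative and must
# match the resource database and converter configuration available locally.
import pandas as pd

if __name__ == '__main__':
    ts = pd.date_range("2018-01-01", "2018-01-07", freq="h")
    tech_points = {"wind_onshore": [(4.0, 50.5), (5.25, 50.0)]}
    cf_df = compute_capacity_factors(tech_points, spatial_res=0.25, timestamps=ts)
    print(cf_df.head())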
Example #34
    tmpa_hdf_file = os.path.join(cfun.tmpa_dir, 'data_tmpa_world_daily.hdf5')
else:
    print('main_evd_maps ERROR:: must specify a valid domain!')

# read dask array with all daily precipitation data
f = h5py.File(tmpa_hdf_file, "r")
print(list(f.keys()))
tmpalat = f['lat'][:]
tmpalon = f['lon'][:]
nlat = np.size(tmpalat)
nlon = np.size(tmpalon)
dates_int = f['dates'][:]
# hours_int = f['hours'][:]
dset = f['prcp']
# print('dataset shape = {}'.format(dset.shape))
x = da.from_array(dset, chunks=(6, 6, 300))
# UTC time
dates = [datetime.strptime(str(integd), '%Y%m%d') for integd in dates_int]
xconus = xr.DataArray(x,
                      coords={
                          'lon': tmpalon,
                          'lat': tmpalat,
                          'time': dates
                      },
                      dims=('lon', 'lat', 'time'))
xconus = xconus.where(xconus >= -0.001)
### end reading prcp dataset ###

# for each grid cell do the following:
ntr = np.size(TR)
Fi = 1 - 1 / TR
Example #35
def hdfgroup2signaldict(group, lazy=False):
    global current_file_version
    global default_version
    if current_file_version < LooseVersion("1.2"):
        metadata = "mapped_parameters"
        original_metadata = "original_parameters"
    else:
        metadata = "metadata"
        original_metadata = "original_metadata"

    exp = {
        'metadata': hdfgroup2dict(group[metadata], lazy=lazy),
        'original_metadata': hdfgroup2dict(group[original_metadata],
                                           lazy=lazy),
        'attributes': {}
    }

    data = group['data']
    if lazy:
        data = da.from_array(data, chunks=data.chunks)
        exp['attributes']['_lazy'] = True
    else:
        data = np.asanyarray(data)
    exp['data'] = data
    axes = []
    for i in range(len(exp['data'].shape)):
        try:
            axes.append(dict(group['axis-%i' % i].attrs))
            axis = axes[-1]
            for key, item in axis.items():
                if isinstance(item, np.bool_):
                    axis[key] = bool(item)
                else:
                    axis[key] = ensure_unicode(item)
        except KeyError:
            break
    if len(axes) != len(exp['data'].shape):  # broke from the previous loop
        try:
            axes = [
                i for k, i in sorted(
                    iter(
                        hdfgroup2dict(group['_list_' +
                                            str(len(exp['data'].shape)) +
                                            '_axes'],
                                      lazy=lazy).items()))
            ]
        except KeyError:
            raise IOError(not_valid_format)
    exp['axes'] = axes
    if 'learning_results' in group.keys():
        exp['attributes']['learning_results'] = \
            hdfgroup2dict(
                group['learning_results'],
                lazy=lazy)
    if 'peak_learning_results' in group.keys():
        exp['attributes']['peak_learning_results'] = \
            hdfgroup2dict(
                group['peak_learning_results'],
                lazy=lazy)

    # If the title was not defined at write time, the experiment is
    # called __unnamed__. The next "if" simply sets the title
    # back to the empty string.
    if "General" in exp["metadata"] and "title" in exp["metadata"]["General"]:
        if '__unnamed__' == exp['metadata']['General']['title']:
            exp['metadata']["General"]['title'] = ''

    if current_file_version < LooseVersion("1.1"):
        # Load the decomposition results written with the old name,
        # mva_results
        if 'mva_results' in group.keys():
            exp['attributes']['learning_results'] = hdfgroup2dict(
                group['mva_results'], lazy=lazy)
        if 'peak_mva_results' in group.keys():
            exp['attributes']['peak_learning_results'] = hdfgroup2dict(
                group['peak_mva_results'], lazy=lazy)
        # Replace the old signal and name keys with their current names
        if 'signal' in exp['metadata']:
            if "Signal" not in exp["metadata"]:
                exp["metadata"]["Signal"] = {}
            exp['metadata']["Signal"]['signal_type'] = \
                exp['metadata']['signal']
            del exp['metadata']['signal']

        if 'name' in exp['metadata']:
            if "General" not in exp["metadata"]:
                exp["metadata"]["General"] = {}
            exp['metadata']['General']['title'] = \
                exp['metadata']['name']
            del exp['metadata']['name']

    if current_file_version < LooseVersion("1.2"):
        if '_internal_parameters' in exp['metadata']:
            exp['metadata']['_HyperSpy'] = \
                exp['metadata']['_internal_parameters']
            del exp['metadata']['_internal_parameters']
            if 'stacking_history' in exp['metadata']['_HyperSpy']:
                exp['metadata']['_HyperSpy']["Stacking_history"] = \
                    exp['metadata']['_HyperSpy']['stacking_history']
                del exp['metadata']['_HyperSpy']["stacking_history"]
            if 'folding' in exp['metadata']['_HyperSpy']:
                exp['metadata']['_HyperSpy']["Folding"] = \
                    exp['metadata']['_HyperSpy']['folding']
                del exp['metadata']['_HyperSpy']["folding"]
        if 'Variance_estimation' in exp['metadata']:
            if "Noise_properties" not in exp["metadata"]:
                exp["metadata"]["Noise_properties"] = {}
            exp['metadata']['Noise_properties']["Variance_linear_model"] = \
                exp['metadata']['Variance_estimation']
            del exp['metadata']['Variance_estimation']
        if "TEM" in exp["metadata"]:
            if "Acquisition_instrument" not in exp["metadata"]:
                exp["metadata"]["Acquisition_instrument"] = {}
            exp["metadata"]["Acquisition_instrument"]["TEM"] = \
                exp["metadata"]["TEM"]
            del exp["metadata"]["TEM"]
            tem = exp["metadata"]["Acquisition_instrument"]["TEM"]
            if "EELS" in tem:
                if "dwell_time" in tem:
                    tem["EELS"]["dwell_time"] = tem["dwell_time"]
                    del tem["dwell_time"]
                if "dwell_time_units" in tem:
                    tem["EELS"]["dwell_time_units"] = tem["dwell_time_units"]
                    del tem["dwell_time_units"]
                if "exposure" in tem:
                    tem["EELS"]["exposure"] = tem["exposure"]
                    del tem["exposure"]
                if "exposure_units" in tem:
                    tem["EELS"]["exposure_units"] = tem["exposure_units"]
                    del tem["exposure_units"]
                if "Detector" not in tem:
                    tem["Detector"] = {}
                tem["Detector"] = tem["EELS"]
                del tem["EELS"]
            if "EDS" in tem:
                if "Detector" not in tem:
                    tem["Detector"] = {}
                if "EDS" not in tem["Detector"]:
                    tem["Detector"]["EDS"] = {}
                tem["Detector"]["EDS"] = tem["EDS"]
                del tem["EDS"]
            del tem
        if "SEM" in exp["metadata"]:
            if "Acquisition_instrument" not in exp["metadata"]:
                exp["metadata"]["Acquisition_instrument"] = {}
            exp["metadata"]["Acquisition_instrument"]["SEM"] = \
                exp["metadata"]["SEM"]
            del exp["metadata"]["SEM"]
            sem = exp["metadata"]["Acquisition_instrument"]["SEM"]
            if "EDS" in sem:
                if "Detector" not in sem:
                    sem["Detector"] = {}
                if "EDS" not in sem["Detector"]:
                    sem["Detector"]["EDS"] = {}
                sem["Detector"]["EDS"] = sem["EDS"]
                del sem["EDS"]
            del sem

        if "Sample" in exp["metadata"] and "Xray_lines" in exp["metadata"][
                "Sample"]:
            exp["metadata"]["Sample"]["xray_lines"] = exp["metadata"][
                "Sample"]["Xray_lines"]
            del exp["metadata"]["Sample"]["Xray_lines"]

        for key in ["title", "date", "time", "original_filename"]:
            if key in exp["metadata"]:
                if "General" not in exp["metadata"]:
                    exp["metadata"]["General"] = {}
                exp["metadata"]["General"][key] = exp["metadata"][key]
                del exp["metadata"][key]
        for key in ["record_by", "signal_origin", "signal_type"]:
            if key in exp["metadata"]:
                if "Signal" not in exp["metadata"]:
                    exp["metadata"]["Signal"] = {}
                exp["metadata"]["Signal"][key] = exp["metadata"][key]
                del exp["metadata"][key]

    if current_file_version < LooseVersion("3.0"):
        if "Acquisition_instrument" in exp["metadata"]:
            # Move tilt_stage to Stage.tilt_a
            # Move exposure time to Detector.Camera.exposure_time
            if "TEM" in exp["metadata"]["Acquisition_instrument"]:
                tem = exp["metadata"]["Acquisition_instrument"]["TEM"]
                exposure = None
                if "tilt_stage" in tem:
                    tem["Stage"] = {"tilt_a": tem["tilt_stage"]}
                    del tem["tilt_stage"]
                if "exposure" in tem:
                    exposure = "exposure"
                # Digital_micrograph plugin was parsing to 'exposure_time'
                # instead of 'exposure': need this to be compatible with
                # previous behaviour
                if "exposure_time" in tem:
                    exposure = "exposure_time"
                if exposure is not None:
                    if "Detector" not in tem:
                        tem["Detector"] = {
                            "Camera": {
                                "exposure": tem[exposure]
                            }
                        }
                    tem["Detector"]["Camera"] = {"exposure": tem[exposure]}
                    del tem[exposure]
            # Move tilt_stage to Stage.tilt_a
            if "SEM" in exp["metadata"]["Acquisition_instrument"]:
                sem = exp["metadata"]["Acquisition_instrument"]["SEM"]
                if "tilt_stage" in sem:
                    sem["Stage"] = {"tilt_a": sem["tilt_stage"]}
                    del sem["tilt_stage"]

    return exp
Example #36
def test_tile(shape, chunks, reps):
    x = np.random.random(shape)
    d = da.from_array(x, chunks=chunks)

    assert_eq(np.tile(x, reps), da.tile(d, reps))
Example #37
def test_tril_triu_non_square_arrays():
    A = np.random.randint(0, 11, (30, 35))
    dA = da.from_array(A, chunks=(5, 5))
    assert_eq(da.triu(dA), np.triu(A))
    assert_eq(da.tril(dA), np.tril(A))
Example #38
def hdfgroup2dict(group, dictionary=None, lazy=False):
    if dictionary is None:
        dictionary = {}
    for key, value in group.attrs.items():
        if isinstance(value, bytes):
            value = value.decode()
        if isinstance(value, (np.string_, str)):
            if value == '_None_':
                value = None
        elif isinstance(value, np.bool_):
            value = bool(value)
        elif isinstance(value, np.ndarray) and value.dtype.char == "S":
            # Convert strings to unicode
            value = value.astype("U")
            if value.dtype.str.endswith("U1"):
                value = value.tolist()
        # skip signals - these are handled below.
        if key.startswith('_sig_'):
            pass
        elif key.startswith('_list_empty_'):
            dictionary[key[len('_list_empty_'):]] = []
        elif key.startswith('_tuple_empty_'):
            dictionary[key[len('_tuple_empty_'):]] = ()
        elif key.startswith('_bs_'):
            dictionary[key[len('_bs_'):]] = value.tostring()
        # The following two elif statements enable reading date and time from
        # v < 2 of HyperSpy's metadata specifications
        elif key.startswith('_datetime_date'):
            date_iso = datetime.date(
                *ast.literal_eval(value[value.index("("):])).isoformat()
            dictionary[key.replace("_datetime_", "")] = date_iso
        elif key.startswith('_datetime_time'):
            date_iso = datetime.time(
                *ast.literal_eval(value[value.index("("):])).isoformat()
            dictionary[key.replace("_datetime_", "")] = date_iso
        else:
            dictionary[key] = value
    if not isinstance(group, h5py.Dataset):
        for key in group.keys():
            if key.startswith('_sig_'):
                from hyperspy.io import dict2signal
                dictionary[key[len('_sig_'):]] = (dict2signal(
                    hdfgroup2signaldict(group[key], lazy=lazy)))
            elif isinstance(group[key], h5py.Dataset):
                dat = group[key]
                kn = key
                if key.startswith("_list_"):
                    ans = np.array(dat)
                    ans = ans.tolist()
                    kn = key[6:]
                elif key.startswith("_tuple_"):
                    ans = np.array(dat)
                    ans = tuple(ans.tolist())
                    kn = key[7:]
                elif dat.dtype.char == "S":
                    ans = np.array(dat)
                    try:
                        ans = ans.astype("U")
                    except UnicodeDecodeError:
                        # There are some strings that must stay in binary,
                        # for example dill pickles. This will obviously also
                        # let "wrong" binary string fail somewhere else...
                        pass
                elif lazy:
                    ans = da.from_array(dat, chunks=dat.chunks)
                else:
                    ans = np.array(dat)
                dictionary[kn] = ans
            elif key.startswith('_hspy_AxesManager_'):
                dictionary[key[len('_hspy_AxesManager_'):]] = AxesManager([
                    i for k, i in sorted(
                        iter(hdfgroup2dict(group[key], lazy=lazy).items()))
                ])
            elif key.startswith('_list_'):
                dictionary[key[7 + key[6:].find('_'):]] = \
                    [i for k, i in sorted(iter(
                        hdfgroup2dict(
                            group[key], lazy=lazy).items()
                    ))]
            elif key.startswith('_tuple_'):
                dictionary[key[8 + key[7:].find('_'):]] = tuple([
                    i for k, i in sorted(
                        iter(hdfgroup2dict(group[key], lazy=lazy).items()))
                ])
            else:
                dictionary[key] = {}
                hdfgroup2dict(group[key], dictionary[key], lazy=lazy)
    return dictionary
Example #39
def test_reductions():
    x = np.random.random((20, 20))
    a = da.from_array(x, blockshape=(7, 7))

    assert eq(a.argmin(axis=1), x.argmin(axis=1))
    assert eq(a.argmax(axis=0), x.argmax(axis=0))
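# Note: this snippet appears to come from a very early dask release; in current
# dask the keyword is `chunks` (e.g. da.from_array(x, chunks=(7, 7))) and the
# comparison helper is `assert_eq` from dask.array.utils rather than `eq`.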
Example #40
    def apply_multiplepoints(self, trav, dist=None, G0=None, nfft=None,
                             rtm=False, greens=False,
                             dottest=False, **kwargs_cgls):
        r"""Marchenko redatuming for multiple points

        Solve the Marchenko redatuming inverse problem for multiple
        points given their direct arrival traveltime curves (``trav``)
        and waveforms (``G0``).

        Parameters
        ----------
        trav : :obj:`numpy.ndarray`
            Traveltime of first arrival from subsurface points to
            surface receivers of size :math:`[n_r \times n_{vs}]`
        dist : :obj:`numpy.ndarray`, optional
            Distance between subsurface point to
            surface receivers of size :math:`[n_r \times n_{vs}]`
            (if provided the analytical direct arrival will be computed using
            a 3d formulation)
        G0 : :obj:`numpy.ndarray`, optional
            Direct arrival in time domain of size
            :math:`[n_r \times n_{vs} \times n_t]` (if None, create arrival
            using ``trav``)
        nfft : :obj:`int`, optional
            Number of samples in fft when creating the analytical direct wave
        rtm : :obj:`bool`, optional
            Compute and return rtm redatuming
        greens : :obj:`bool`, optional
            Compute and return Green's functions
        dottest : :obj:`bool`, optional
            Apply dot-test
        **kwargs_cgls
            Arbitrary keyword arguments for
            :py:func:`pylops_distributed.optimization.cg.cgls` solver

        Returns
        -------
        f1_inv_minus : :obj:`numpy.ndarray`
            Inverted upgoing focusing function of size
            :math:`[n_r \times n_{vs} \times n_t]`
        f1_inv_plus : :obj:`numpy.ndarray`
            Inverted downgoing focusing function of size
            :math:`[n_r \times n_{vs} \times n_t]`
        p0_minus : :obj:`numpy.ndarray`
            Single-scattering standard redatuming upgoing Green's function
            of size :math:`[n_r \times n_{vs} \times n_t]`
        g_inv_minus : :obj:`numpy.ndarray`
            Inverted upgoing Green's function of size
            :math:`[n_r \times n_{vs} \times n_t]`
        g_inv_plus : :obj:`numpy.ndarray`
            Inverted downgoing Green's function of size
            :math:`[n_r \times n_{vs} \times n_t]`

        """
        nvs = trav.shape[1]

        # Create window
        trav_off = trav - self.toff
        trav_off = np.round(trav_off / self.dt).astype(np.int)

        w = np.zeros((self.nr, nvs, self.nt), dtype=self.dtype)
        for ir in range(self.nr):
            for ivs in range(nvs):
                w[ir, ivs, :trav_off[ir, ivs]] = 1
        w = np.concatenate((np.flip(w, axis=-1), w[:, :, 1:]), axis=-1)
        if self.nsmooth > 0:
            smooth = np.ones(self.nsmooth) / self.nsmooth
            w = filtfilt(smooth, 1, w)
        w = w.astype(self.dtype)

        # Create operators
        Rop = MDC(self.Rtwosided_fft, self.nt2, nv=nvs, dt=self.dt,
                  dr=self.dr, twosided=True, conj=False, saveGt=self.saveRt,
                  prescaled=self.prescaled)
        R1op = MDC(self.Rtwosided_fft, self.nt2, nv=nvs, dt=self.dt,
                   dr=self.dr, twosided=True, conj=True, saveGt=self.saveRt,
                   prescaled=self.prescaled)
        Rollop = Roll(self.ns * nvs * self.nt2,
                      dims=(self.nt2, self.ns, nvs),
                      dir=0, shift=-1, dtype=self.dtype)
        Wop = Diagonal(da.from_array(w.transpose(2, 0, 1).flatten()),
                       dtype=self.dtype)
        Iop = Identity(self.nr * nvs * self.nt2, dtype=self.dtype)
        Mop = Block([[Iop, -1 * Wop * Rop],
                     [-1 * Wop * Rollop * R1op, Iop]]) * BlockDiag([Wop, Wop])
        Gop = Block([[Iop, -1 * Rop],
                     [-1 * Rollop * R1op, Iop]])

        if dottest:
            Dottest(Gop, 2 * self.nr * nvs * self.nt2,
                    2 * self.nr * nvs * self.nt2,
                    chunks=(2 * self.ns * nvs * self.nt2,
                            2 * self.nr * nvs * self.nt2),
                    raiseerror=True, verb=True)
        if dottest:
            Dottest(Mop, 2 * self.ns * nvs * self.nt2,
                    2 * self.nr * nvs * self.nt2,
                    chunks=(2 * self.ns * nvs * self.nt2,
                            2 * self.nr * nvs * self.nt2),
                    raiseerror=True, verb=True)

        # Create input focusing function
        if G0 is None:
            if self.wav is not None and nfft is not None:
                G0 = np.zeros((self.nr, nvs, self.nt), dtype=self.dtype)
                for ivs in range(nvs):
                    G0[:, ivs] = (directwave(self.wav, trav[:, ivs],
                                             self.nt, self.dt, nfft=nfft,
                                             derivative=True,  dist=dist,
                                             kind='2d' if dist is None else '3d')).T
            else:
                logging.error('wav and/or nfft are not provided. '
                              'Provide either G0 or wav and nfft...')
                raise ValueError('wav and/or nfft are not provided. '
                                 'Provide either G0 or wav and nfft...')
            G0 = G0.astype(self.dtype)

        fd_plus = np.concatenate((np.flip(G0, axis=-1).transpose(2, 0, 1),
                                  np.zeros((self.nt - 1, self.nr, nvs),
                                           dtype=self.dtype)))
        fd_plus = da.from_array(fd_plus).rechunk(fd_plus.shape)

        # Run standard redatuming as benchmark
        if rtm:
            p0_minus = Rop * fd_plus.flatten()
            p0_minus = p0_minus.reshape(self.nt2, self.ns,
                                        nvs).transpose(1, 2, 0)

        # Create data and inverse focusing functions
        d = Wop * Rop * fd_plus.flatten()
        d = da.concatenate((d.reshape(self.nt2, self.ns, nvs),
                            da.zeros((self.nt2, self.ns, nvs),
                                     dtype=self.dtype)))

        # Invert for focusing functions
        f1_inv = cgls(Mop, d.flatten(), **kwargs_cgls)[0]
        f1_inv = f1_inv.reshape(2 * self.nt2, self.nr, nvs)
        f1_inv_tot = \
            f1_inv + da.concatenate((da.zeros((self.nt2, self.nr, nvs),
                                              dtype=self.dtype), fd_plus))
        if greens:
            # Create Green's functions
            g_inv = Gop * f1_inv_tot.flatten()
            g_inv = g_inv.reshape(2 * self.nt2, self.ns, nvs)

        # Compute
        if rtm and greens:
            d, p0_minus, f1_inv_tot, g_inv = \
                da.compute(d, p0_minus, f1_inv_tot, g_inv)
        elif rtm:
            d, p0_minus, f1_inv_tot = \
                da.compute(d, p0_minus, f1_inv_tot)
        elif greens:
            d, f1_inv_tot, g_inv = \
                da.compute(d, f1_inv_tot, g_inv)
        else:
            d, f1_inv_tot = \
                da.compute(d, f1_inv_tot)

        # Separate focusing and Green's functions
        f1_inv_minus = f1_inv_tot[:self.nt2].transpose(1, 2, 0)
        f1_inv_plus = f1_inv_tot[self.nt2:].transpose(1, 2, 0)
        if greens:
            g_inv_minus = -g_inv[:self.nt2].transpose(1, 2, 0)
            g_inv_plus = np.flip(g_inv[self.nt2:], axis=0).transpose(1, 2, 0)

        if rtm and greens:
            return f1_inv_minus, f1_inv_plus, p0_minus, g_inv_minus, g_inv_plus
        elif rtm:
            return f1_inv_minus, f1_inv_plus, p0_minus
        elif greens:
            return f1_inv_minus, f1_inv_plus, g_inv_minus, g_inv_plus
        else:
            return f1_inv_minus, f1_inv_plus
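A minimal usage sketch of the routine documented above (all names here are hypothetical: it assumes the method is exposed as apply_multiplepoints on a Marchenko-style operator object MarchenkoWM that was already built from the two-sided reflection response, as in pylops-style APIs):

# trav: (nr, nvs) array of direct-arrival traveltimes for the chosen subsurface points
f1_minus, f1_plus, p0_minus, g_minus, g_plus = MarchenkoWM.apply_multiplepoints(
    trav,
    G0=None, nfft=2**11,    # build the direct arrival analytically from the wavelet
    rtm=True, greens=True,  # also return the RTM benchmark and the Green's functions
    dottest=False,
    niter=10)               # example kwargs_cgls entry; actual names depend on the cgls solver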
Exemple #41
0
def test_confusion_matrix_normalize(normalize, expected_results, client):
    y_test = da.from_array(cp.array([0, 1, 2] * 6))
    y_pred = da.from_array(cp.array(list(chain(*permutations([0, 1, 2])))))
    cm = confusion_matrix(y_test, y_pred, normalize=normalize)
    cp.testing.assert_allclose(cm, cp.array(expected_results))
    ## Kinematics
    #branches = ["pho_pT", "pho_E", "pho_eta", "pho_phi"]
    #X_p4 = da.concatenate([\
    #            da.from_delayed(\
    #                load_single(tree,i,i+chunk_size, branches),\
    #                shape=(chunk_size,len(branches)),\
    #                dtype=np.float32)\
    #            for i in range(0,neff,chunk_size)])
    #print " >> Expected shape:", X_p4.shape

    # Class label
    label = j
    label = 0
    print " >> Class label:", label
    y = da.from_array(\
            np.full(X.shape[0], label, dtype=np.float32),\
            chunks=(chunk_size,))

    #file_out_str = "%s/%s_IMG_RH%d_n%dk_label%d.hdf5"%(eosDir,decay,int(scale),neff//1000.,label)
    file_out_str = "%s/%s_IMGcrop_RH%d_n%dkx2.hdf5" % (
        eosDir, decay, int(scale), neff // 1000.)
    #file_out_str = "test.hdf5"
    print " >> Writing to:", file_out_str
    #da.to_hdf5(file_out_str, {'/X': X, '/y': y, 'eventId': eventId, 'X_crop0': X_crop0, 'X_crop1': X_crop1}, compression='lzf')
    da.to_hdf5(
        file_out_str,
        {
            #'/X': X,
            '/y': y,
            #'eventId': eventId,
            'X_crop0': X_crop0,
Exemple #43
0
def make_low_rank_matrix(n_samples=100,
                         n_features=100,
                         effective_rank=10,
                         tail_strength=0.5,
                         random_state=None,
                         n_parts=1,
                         n_samples_per_part=None,
                         dtype='float32'):
    """ Generate a mostly low rank matrix with bell-shaped singular values

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of samples.
    n_features : int, optional (default=100)
        The number of features.
    effective_rank : int, optional (default=10)
        The approximate number of singular vectors required to explain most of
        the data by linear combinations.
    tail_strength : float between 0.0 and 1.0, optional (default=0.5)
        The relative importance of the fat noisy tail of the singular values
        profile.
    random_state : int, CuPy RandomState instance, Dask RandomState instance \
                   or None (default)
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.
    n_parts : int, optional (default=1)
        The number of parts of work.
    n_samples_per_part : int, optional (default=None)
        The number of samples per partition used in the final matrix product
        (defaults to n_samples / n_parts).
    dtype : str, optional (default='float32')
        dtype of generated data

    Returns
    -------
    X : Dask-CuPy array of shape [n_samples, n_features]
        The matrix.

    """

    rs = _create_rs_generator(random_state)
    n = min(n_samples, n_features)

    # Random (ortho normal) vectors
    m1 = rs.standard_normal(
        (n_samples, n),
        chunks=(_generate_chunks_for_qr(n_samples, n, n_parts), -1),
        dtype=dtype)
    u, _ = da.linalg.qr(m1)

    m2 = rs.standard_normal(
        (n, n_features),
        chunks=(-1, _generate_chunks_for_qr(n_features, n, n_parts)),
        dtype=dtype)
    v, _ = da.linalg.qr(m2)

    # For final multiplication
    if n_samples_per_part is None:
        n_samples_per_part = max(1, int(n_samples / n_parts))
    u = u.rechunk({0: n_samples_per_part, 1: -1})
    v = v.rechunk({0: n_samples_per_part, 1: -1})

    local_s = _generate_singular_values(n, effective_rank, tail_strength,
                                        n_samples_per_part)
    s = da.from_array(local_s, chunks=(int(n_samples_per_part), ))

    u *= s
    return da.dot(u, v)
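A short sanity check for the generator above (illustrative only; it assumes this make_low_rank_matrix is importable and that a Dask scheduler with CuPy-capable workers is available): the effective rank shows up as a handful of dominant singular values.

import dask.array as da

X = make_low_rank_matrix(n_samples=200, n_features=50,
                         effective_rank=5, tail_strength=0.1,
                         random_state=0, n_parts=2)
_, s, _ = da.linalg.svd(X)      # tall-and-skinny SVD; column dimension is a single chunk
print(s.compute()[:10])         # singular values decay quickly beyond ~effective_rank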
Exemple #44
0
def from_bcolz(x, chunksize=None, categorize=True, index=None, **kwargs):
    """ Read dask Dataframe from bcolz.ctable

    Parameters
    ----------

    x : bcolz.ctable
        Input data
    chunksize : int (optional)
        The size of blocks to pull out from ctable.  Ideally as large as can
        comfortably fit in memory
    categorize : bool (defaults to True)
        Automatically categorize all string dtypes
    index : string (optional)
        Column to make the index

    See Also
    --------

    from_array: more generic function not optimized for bcolz
    """
    import dask.array as da
    import bcolz

    if isinstance(x, (str, unicode)):
        x = bcolz.ctable(rootdir=x)
    bc_chunklen = max(x[name].chunklen for name in x.names)
    if chunksize is None and bc_chunklen > 10000:
        chunksize = bc_chunklen

    categories = dict()
    if categorize:
        for name in x.names:
            if (np.issubdtype(x.dtype[name], np.string_)
                    or np.issubdtype(x.dtype[name], np.unicode_)
                    or np.issubdtype(x.dtype[name], np.object_)):
                a = da.from_array(x[name], chunks=(chunksize * len(x.names), ))
                categories[name] = da.unique(a)

    columns = tuple(x.dtype.names)
    divisions = (0, ) + tuple(range(-1, len(x), chunksize))[1:]
    if divisions[-1] != len(x) - 1:
        divisions = divisions + (len(x) - 1, )
    if x.rootdir:
        token = tokenize((x.rootdir, os.path.getmtime(x.rootdir)), chunksize,
                         categorize, index, kwargs)
    else:
        token = tokenize((id(x), x.shape, x.dtype), chunksize, categorize,
                         index, kwargs)
    new_name = 'from_bcolz-' + token

    dsk = dict(((new_name, i), (locked_df_from_ctable, x,
                                (slice(i * chunksize, (i + 1) * chunksize), ),
                                columns, categories))
               for i in range(0, int(ceil(len(x) / chunksize))))

    result = DataFrame(dsk, new_name, columns, divisions)

    if index:
        assert index in x.names
        a = da.from_array(x[index], chunks=(chunksize * len(x.names), ))
        q = np.linspace(0, 100, len(x) // chunksize + 2)
        divisions = da.percentile(a, q).compute()
        return set_partition(result, index, divisions, **kwargs)
    else:
        return result
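A hedged usage sketch for from_bcolz (bcolz is unmaintained and this reader comes from an old dask version, so treat it as illustrative rather than current API):

import numpy as np
import bcolz

# small in-memory ctable with a numeric and a string column
ct = bcolz.ctable(columns=[np.arange(10), np.array(['a', 'b'] * 5)],
                  names=['x', 'tag'])
ddf = from_bcolz(ct, chunksize=4)                     # 'tag' is categorized automatically
ddf_indexed = from_bcolz(ct, chunksize=4, index='x')  # repartitioned on the 'x' column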
Exemple #45
0
def test_slicing_with_Nones(shape, slice):
    x = np.random.random(shape)
    d = da.from_array(x, chunks=shape)

    assert_eq(x[slice], d[slice])
def add_ancillary_datasets(scene,
                           lons,
                           lats,
                           sunz,
                           satz,
                           azidiff,
                           chunks=(512, 3712)):
    """Add ancillary datasets to the scene.

    Args:
        scene: Satpy Scene to which the ancillary datasets are added
        lons: Longitude coordinates
        lats: Latitude coordinates
        sunz: Solar zenith angle
        satz: Satellite zenith angle
        azidiff: Absolute azimuth difference angle
        chunks: Chunksize

    """
    start_time = scene['IR_108'].attrs['start_time']
    end_time = scene['IR_108'].attrs['end_time']
    angle_coords = scene['IR_108'].coords

    # Latitude
    scene['lat'] = xr.DataArray(da.from_array(lats, chunks=chunks),
                                dims=['y', 'x'],
                                coords={
                                    'y': scene['IR_108']['y'],
                                    'x': scene['IR_108']['x']
                                })
    scene['lat'].attrs['long_name'] = 'latitude coordinate'
    scene['lat'].attrs['standard_name'] = 'latitude'
    scene['lat'].attrs['units'] = 'degrees_north'
    scene['lat'].attrs['start_time'] = start_time
    scene['lat'].attrs['end_time'] = end_time

    # Longitude
    scene['lon'] = xr.DataArray(da.from_array(lons, chunks=chunks),
                                dims=['y', 'x'],
                                coords={
                                    'y': scene['IR_108']['y'],
                                    'x': scene['IR_108']['x']
                                })
    scene['lon'].attrs['long_name'] = 'longitude coordinate'
    scene['lon'].attrs['standard_name'] = 'longitude'
    scene['lon'].attrs['units'] = 'degrees_east'
    scene['lon'].attrs['start_time'] = start_time
    scene['lon'].attrs['end_time'] = end_time

    # Sunzenith
    scene['sunzenith'] = xr.DataArray(da.from_array(sunz[:, :], chunks=chunks),
                                      dims=['y', 'x'],
                                      coords=angle_coords)

    # Satzenith
    scene['satzenith'] = xr.DataArray(da.from_array(satz[:, :], chunks=chunks),
                                      dims=['y', 'x'],
                                      coords=angle_coords)

    # Azidiff
    scene['azimuthdiff'] = xr.DataArray(da.from_array(azidiff[:, :],
                                                      chunks=chunks),
                                        dims=['y', 'x'],
                                        coords=angle_coords)

    # Update the attributes
    update_angle_attributes(scene, band=scene['IR_108'])
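A minimal call sketch for add_ancillary_datasets (array sizes are hypothetical; it assumes scene is a Satpy-like Scene that already holds an 'IR_108' DataArray with start_time/end_time attributes and y/x coordinates):

import numpy as np

shape = (3712, 3712)  # full-disk grid matching the default chunks
lons = np.zeros(shape, dtype=np.float32)
lats = np.zeros(shape, dtype=np.float32)
sunz = np.zeros(shape, dtype=np.float32)
satz = np.zeros(shape, dtype=np.float32)
azidiff = np.zeros(shape, dtype=np.float32)
add_ancillary_datasets(scene, lons, lats, sunz, satz, azidiff, chunks=(512, 3712))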
Exemple #47
0
def test_tile_neg_reps(shape, chunks, reps):
    x = np.random.random(shape)
    d = da.from_array(x, chunks=chunks)

    with pytest.raises(ValueError):
        da.tile(d, reps)
Exemple #48
0
def test_slicing_consistent_names():
    x = np.arange(100).reshape((10, 10))
    a = da.from_array(x, chunks=(5, 5))
    assert same_keys(a[0], a[0])
    assert same_keys(a[:, [1, 2, 3]], a[:, [1, 2, 3]])
    assert same_keys(a[:, 5:2:-1], a[:, 5:2:-1])
Exemple #49
0
def test_diagonal():
    v = np.arange(11)
    with pytest.raises(ValueError):
        da.diagonal(v)

    v = np.arange(4).reshape((2, 2))
    with pytest.raises(ValueError):
        da.diagonal(v, axis1=0, axis2=0)

    with pytest.raises(AxisError):
        da.diagonal(v, axis1=-4)

    with pytest.raises(AxisError):
        da.diagonal(v, axis2=-4)

    v = np.arange(4 * 5 * 6).reshape((4, 5, 6))
    v = da.from_array(v, chunks=2)
    assert_eq(da.diagonal(v), np.diagonal(v))
    # Empty diagonal.
    assert_eq(da.diagonal(v, offset=10), np.diagonal(v, offset=10))
    assert_eq(da.diagonal(v, offset=-10), np.diagonal(v, offset=-10))

    with pytest.raises(ValueError):
        da.diagonal(v, axis1=-2)

    # Negative axis.
    assert_eq(da.diagonal(v, axis1=-1), np.diagonal(v, axis1=-1))
    assert_eq(da.diagonal(v, offset=1, axis1=-1),
              np.diagonal(v, offset=1, axis1=-1))

    # Heterogeneous chunks.
    v = np.arange(2 * 3 * 4 * 5 * 6).reshape((2, 3, 4, 5, 6))
    v = da.from_array(v, chunks=(1, (1, 2), (1, 2, 1), (2, 1, 2), (5, 1)))

    assert_eq(da.diagonal(v), np.diagonal(v))
    assert_eq(
        da.diagonal(v, offset=2, axis1=3, axis2=1),
        np.diagonal(v, offset=2, axis1=3, axis2=1),
    )

    assert_eq(
        da.diagonal(v, offset=-2, axis1=3, axis2=1),
        np.diagonal(v, offset=-2, axis1=3, axis2=1),
    )

    assert_eq(
        da.diagonal(v, offset=-2, axis1=3, axis2=4),
        np.diagonal(v, offset=-2, axis1=3, axis2=4),
    )

    assert_eq(da.diagonal(v, 1), np.diagonal(v, 1))
    assert_eq(da.diagonal(v, -1), np.diagonal(v, -1))
    # Positional arguments
    assert_eq(da.diagonal(v, 1, 2, 1), np.diagonal(v, 1, 2, 1))

    v = np.arange(2 * 3 * 4 * 5 * 6).reshape((2, 3, 4, 5, 6))
    assert_eq(da.diagonal(v, axis1=1, axis2=3), np.diagonal(v,
                                                            axis1=1,
                                                            axis2=3))
    assert_eq(
        da.diagonal(v, offset=1, axis1=1, axis2=3),
        np.diagonal(v, offset=1, axis1=1, axis2=3),
    )

    assert_eq(
        da.diagonal(v, offset=1, axis1=3, axis2=1),
        np.diagonal(v, offset=1, axis1=3, axis2=1),
    )

    assert_eq(
        da.diagonal(v, offset=-5, axis1=3, axis2=1),
        np.diagonal(v, offset=-5, axis1=3, axis2=1),
    )

    assert_eq(
        da.diagonal(v, offset=-6, axis1=3, axis2=1),
        np.diagonal(v, offset=-6, axis1=3, axis2=1),
    )

    assert_eq(
        da.diagonal(v, offset=-6, axis1=-3, axis2=1),
        np.diagonal(v, offset=-6, axis1=-3, axis2=1),
    )

    v = da.from_array(v, chunks=2)
    assert_eq(
        da.diagonal(v, offset=1, axis1=3, axis2=1),
        np.diagonal(v, offset=1, axis1=3, axis2=1),
    )
    assert_eq(
        da.diagonal(v, offset=-1, axis1=3, axis2=1),
        np.diagonal(v, offset=-1, axis1=3, axis2=1),
    )

    v = np.arange(384).reshape((8, 8, 6))
    assert_eq(da.diagonal(v, offset=-1, axis1=2),
              np.diagonal(v, offset=-1, axis1=2))

    v = da.from_array(v, chunks=(4, 4, 2))
    assert_eq(da.diagonal(v, offset=-1, axis1=2),
              np.diagonal(v, offset=-1, axis1=2))
Exemple #50
0
def _create_data(objective,
                 n_samples=100,
                 output='array',
                 chunk_size=50,
                 **kwargs):
    if objective.endswith('classification'):
        if objective == 'binary-classification':
            centers = [[-4, -4], [4, 4]]
        elif objective == 'multiclass-classification':
            centers = [[-4, -4], [4, 4], [-4, 4]]
        else:
            raise ValueError(f"Unknown classification task '{objective}'")
        X, y = make_blobs(n_samples=n_samples,
                          centers=centers,
                          random_state=42)
    elif objective == 'regression':
        X, y = make_regression(n_samples=n_samples, random_state=42)
    elif objective == 'ranking':
        return _create_ranking_data(n_samples=n_samples,
                                    output=output,
                                    chunk_size=chunk_size,
                                    **kwargs)
    else:
        raise ValueError("Unknown objective '%s'" % objective)
    rnd = np.random.RandomState(42)
    weights = rnd.random(X.shape[0]) * 0.01

    if output == 'array':
        dX = da.from_array(X, (chunk_size, X.shape[1]))
        dy = da.from_array(y, chunk_size)
        dw = da.from_array(weights, chunk_size)
    elif output.startswith('dataframe'):
        X_df = pd.DataFrame(
            X, columns=['feature_%d' % i for i in range(X.shape[1])])
        if output == 'dataframe-with-categorical':
            num_cat_cols = 5
            for i in range(num_cat_cols):
                col_name = "cat_col" + str(i)
                cat_values = rnd.choice(['a', 'b'], X.shape[0])
                cat_series = pd.Series(cat_values, dtype='category')
                X_df[col_name] = cat_series
                X = np.hstack((X, cat_series.cat.codes.values.reshape(-1, 1)))

            # for the small data sizes used in tests, it's hard to get LGBMRegressor to choose
            # categorical features for splits. So for regression tests with categorical features,
            # _create_data() returns a DataFrame with ONLY categorical features
            if objective == 'regression':
                cat_cols = [
                    col for col in X_df.columns if col.startswith('cat_col')
                ]
                X_df = X_df[cat_cols]
                X = X[:, -num_cat_cols:]
        y_df = pd.Series(y, name='target')
        dX = dd.from_pandas(X_df, chunksize=chunk_size)
        dy = dd.from_pandas(y_df, chunksize=chunk_size)
        dw = dd.from_array(weights, chunksize=chunk_size)
    elif output == 'scipy_csr_matrix':
        dX = da.from_array(X, chunks=(chunk_size,
                                      X.shape[1])).map_blocks(csr_matrix)
        dy = da.from_array(y, chunks=chunk_size)
        dw = da.from_array(weights, chunk_size)
    else:
        raise ValueError("Unknown output type '%s'" % output)

    return X, y, weights, None, dX, dy, dw, None
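A small usage sketch of the _create_data helper above (illustrative; it assumes make_blobs and make_regression come from sklearn.datasets and that a dask scheduler is running, as in the surrounding tests):

# dense dask-array inputs for a binary classification task
X, y, w, _, dX, dy, dw, _ = _create_data('binary-classification',
                                         n_samples=100, output='array',
                                         chunk_size=50)
print(dX.chunks, dy.chunks)  # ((50, 50), (2,)) and ((50, 50),)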
Exemple #51
0
def test_tril_triu_errors():
    A = np.random.randint(0, 11, (10, 10, 10))
    dA = da.from_array(A, chunks=(5, 5, 5))
    pytest.raises(ValueError, lambda: da.triu(dA))
Exemple #52
0
def test_issignedinf():
    arr = np.random.randint(-1, 2, size=(20, 20)).astype(float) / 0
    darr = da.from_array(arr, 3)

    assert_eq(np.isneginf(arr), da.isneginf(darr))
    assert_eq(np.isposinf(arr), da.isposinf(darr))
Exemple #53
0
def fit(model, x, y, compute=True, **kwargs):
    """ Fit scikit learn model against dask arrays

    Model must support the ``partial_fit`` interface for online or batch
    learning.

    This method will be called on dask arrays in sequential order.  Ideally
    your rows are independent and identically distributed.

    Parameters
    ----------
    model: sklearn model
        Any model supporting partial_fit interface
    x: dask Array
        Two dimensional array, likely tall and skinny
    y: dask Array
        One dimensional array with same chunks as x's rows
    compute: bool, optional
        Whether to compute the fit immediately (default) or return a lazy
        ``Delayed`` object
    kwargs:
        options to pass to partial_fit

    Examples
    --------
    >>> import dask.array as da
    >>> X = da.random.random((10, 3), chunks=(5, 3))
    >>> y = da.random.randint(0, 2, 10, chunks=(5,))

    >>> from sklearn.linear_model import SGDClassifier
    >>> sgd = SGDClassifier()

    >>> sgd = da.learn.fit(sgd, X, y, classes=[1, 0])
    >>> sgd  # doctest: +SKIP
    SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
           fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
           loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5,
           random_state=None, shuffle=False, verbose=0, warm_start=False)

    This passes all of X and y through the classifier sequentially.  We can then
    use the fitted classifier as normal on in-memory data

    >>> import numpy as np
    >>> sgd.predict(np.random.random((4, 3)))  # doctest: +SKIP
    array([1, 0, 0, 1])

    Or predict on a larger dataset

    >>> z = da.random.random((400, 3), chunks=(100, 3))
    >>> da.learn.predict(sgd, z)  # doctest: +SKIP
    dask.array<x_11, shape=(400,), chunks=((100, 100, 100, 100),), dtype=int64>
    """
    assert x.ndim == 2
    if isinstance(x, np.ndarray):
        x = da.from_array(x, chunks=x.shape)
    if isinstance(y, np.ndarray):
        y = da.from_array(y, chunks=y.shape)
    if y is not None:
        assert y.ndim == 1
        assert x.chunks[0] == y.chunks[0]
    assert hasattr(model, 'partial_fit')
    if len(x.chunks[1]) > 1:
        x = x.rechunk(chunks=(x.chunks[0], sum(x.chunks[1])))

    nblocks = len(x.chunks[0])

    name = 'fit-' + dask.base.tokenize(model, x, y, kwargs)
    dsk = {(name, -1): model}
    dsk.update({(name, i): (_partial_fit, (name, i - 1), (x.name, i, 0),
                            (getattr(y, 'name', ''), i), kwargs)
                for i in range(nblocks)})

    new_dsk = dask.sharedict.merge((name, dsk), x.dask, getattr(y, 'dask', {}))
    value = Delayed((name, nblocks - 1), new_dsk)

    if compute:
        return value.compute()
    else:
        return value
Exemple #54
0
def main(args):
    if args.psf_pars is None:
        print("Attempting to take psf_pars from residual fits header")
        try:
            rhdr = fits.getheader(args.residual)
        except KeyError:
            raise RuntimeError("Either provide a residual with beam "
                               "information or pass them in using --psf_pars "
                               "argument")
        if 'BMAJ1' in rhdr.keys():
            emaj = rhdr['BMAJ1']
            emin = rhdr['BMIN1']
            pa = rhdr['BPA1']
            gaussparf = (emaj, emin, pa)
        elif 'BMAJ' in rhdr.keys():
            emaj = rhdr['BMAJ']
            emin = rhdr['BMIN']
            pa = rhdr['BPA']
            gaussparf = (emaj, emin, pa)
        else:
            raise RuntimeError("No beam parameters (BMAJ/BMIN/BPA) found in "
                               "residual header; pass them in using --psf_pars")
    else:
        gaussparf = tuple(args.psf_pars)

    if args.circ_psf:
        # gaussparf may be a tuple, so build a new one rather than assigning in place
        e = (gaussparf[0] + gaussparf[1]) / 2.0
        gaussparf = (e, e, gaussparf[2])

    print("Using emaj = %3.2e, emin = %3.2e, PA = %3.2e \n" % gaussparf)

    # load model image
    model = load_fits(args.model, dtype=args.out_dtype)
    model = model.squeeze()
    orig_shape = model.shape
    mhdr = fits.getheader(args.model)

    l_coord, ref_l = data_from_header(mhdr, axis=1)
    l_coord -= ref_l
    m_coord, ref_m = data_from_header(mhdr, axis=2)
    m_coord -= ref_m
    if mhdr["CTYPE4"].lower() == 'freq':
        freq_axis = 4
    elif mhdr["CTYPE3"].lower() == 'freq':
        freq_axis = 3
    else:
        raise ValueError("Freq axis must be 3rd or 4th")

    mfs_shape = list(orig_shape)
    mfs_shape[0] = 1
    mfs_shape = tuple(mfs_shape)
    freqs, ref_freq = data_from_header(mhdr, axis=freq_axis)

    nband = freqs.size
    if nband < 2:
        raise ValueError("Can't produce alpha map from a single band image")
    npix_l = l_coord.size
    npix_m = m_coord.size

    # update cube psf-pars
    for i in range(1, nband + 1):
        mhdr['BMAJ' + str(i)] = gaussparf[0]
        mhdr['BMIN' + str(i)] = gaussparf[1]
        mhdr['BPA' + str(i)] = gaussparf[2]

    if args.ref_freq is not None and args.ref_freq != ref_freq:
        ref_freq = args.ref_freq
        print(
            'Provided reference frequency does not match that of fits file. Will overwrite.'
        )

    print("Cube frequencies:")
    with np.printoptions(precision=2):
        print(freqs)
    print("Reference frequency is %3.2e Hz \n" % ref_freq)

    # LB - new header for cubes if ref_freqs differ
    new_hdr = set_header_info(mhdr, ref_freq, freq_axis, args, gaussparf)

    # save next to model if no outfile is provided
    if args.output_filename is None:
        # strip .fits from model filename
        tmp = args.model[::-1]
        idx = tmp.find('.')
        outfile = args.model[0:-(idx + 1)]
    else:
        outfile = args.output_filename

    xx, yy = np.meshgrid(l_coord, m_coord, indexing='ij')

    # load beam
    if args.beam_model is not None:
        bhdr = fits.getheader(args.beam_model)
        l_coord_beam, ref_lb = data_from_header(bhdr, axis=1)
        l_coord_beam -= ref_lb
        if not np.array_equal(l_coord_beam, l_coord):
            raise ValueError(
                "l coordinates of beam model do not match those of image. Use power_beam_maker to interpolate to fits header."
            )

        m_coord_beam, ref_mb = data_from_header(bhdr, axis=2)
        m_coord_beam -= ref_mb
        if not np.array_equal(m_coord_beam, m_coord):
            raise ValueError(
                "m coordinates of beam model do not match those of image. Use power_beam_maker to interpolate to fits header."
            )

        freqs_beam, _ = data_from_header(bhdr, axis=freq_axis)
        if not np.array_equal(freqs, freqs_beam):
            raise ValueError(
                "Freqs of beam model do not match those of image. Use power_beam_maker to interpolate to fits header."
            )

        beam_image = load_fits(args.beam_model,
                               dtype=args.out_dtype).reshape(model.shape)
    else:
        beam_image = np.ones(model.shape, dtype=args.out_dtype)

    # do beam correction LB - TODO: use forward model instead
    beammin = np.amin(beam_image, axis=0)[None, :, :]
    model = np.where(beammin >= args.pb_min, model / beam_image, 0.0)

    if not args.dont_convolve:
        print("Computing clean beam")
        # convolve model to desired resolution
        model, gausskern = convolve2gaussres(model, xx, yy, gaussparf,
                                             args.ncpu, None,
                                             args.padding_frac)

        # save clean beam
        if 'c' in args.products:
            name = outfile + '.clean_psf.fits'
            save_fits(name,
                      gausskern.reshape(mfs_shape),
                      new_hdr,
                      dtype=args.out_dtype)
            print("Wrote clean psf to %s \n" % name)

        # save convolved model
        if 'm' in args.products:
            name = outfile + '.convolved_model.fits'
            save_fits(name,
                      model.reshape(orig_shape),
                      new_hdr,
                      dtype=args.out_dtype)
            print("Wrote convolved model to %s \n" % name)

    # add in residuals and set threshold
    if args.residual is not None:
        resid = load_fits(args.residual, dtype=args.out_dtype).squeeze()
        rhdr = fits.getheader(args.residual)
        l_res, ref_lb = data_from_header(rhdr, axis=1)
        l_res -= ref_lb
        if not np.array_equal(l_res, l_coord):
            raise ValueError(
                "l coordinates of residual do not match those of model")

        m_res, ref_mb = data_from_header(rhdr, axis=2)
        m_res -= ref_mb
        if not np.array_equal(m_res, m_coord):
            raise ValueError(
                "m coordinates of residual do not match those of model")

        freqs_res, _ = data_from_header(rhdr, axis=freq_axis)
        if not np.array_equal(freqs, freqs_res):
            raise ValueError("Freqs of residual do not match those of model")

        # convolve residual to same resolution as model
        gausspari = ()
        for i in range(1, nband + 1):
            key = 'BMAJ' + str(i)
            if key in rhdr.keys():
                emaj = rhdr[key]
                emin = rhdr['BMIN' + str(i)]
                pa = rhdr['BPA' + str(i)]
                gausspari += ((emaj, emin, pa), )
            else:
                print(
                    "Can't find Gausspars in residual header, unable to add residuals back in"
                )
                gausspari = None
                break

        if gausspari is not None and args.add_convolved_residuals:
            resid, _ = convolve2gaussres(resid,
                                         xx,
                                         yy,
                                         gaussparf,
                                         args.ncpu,
                                         gausspari,
                                         args.padding_frac,
                                         norm_kernel=True)
            model += resid
            print("Convolved residuals added to convolved model")

            if 'c' in args.products:
                name = outfile + '.convolved_residual.fits'
                save_fits(name, resid.reshape(orig_shape), rhdr)
                print("Wrote convolved residuals to %s" % name)

        counts = np.sum(resid != 0)
        rms = np.sqrt(np.sum(resid**2) / counts)
        rms_cube = np.std(resid.reshape(nband, npix_l * npix_m),
                          axis=1).ravel()
        threshold = args.threshold * rms
        print("Setting cutoff threshold as %i times the rms "
              "of the residual " % args.threshold)
        del resid
    else:
        print("No residual provided. Setting  threshold i.t.o dynamic range. "
              "Max dynamic range is %i " % args.maxDR)
        threshold = model.max() / args.maxDR
        rms_cube = None

    print("Threshold set to %f Jy. \n" % threshold)

    # get pixels above threshold
    minimage = np.amin(model, axis=0)
    maskindices = np.argwhere(minimage > threshold)
    if not maskindices.size:
        raise ValueError("No components found above threshold. "
                         "Try lowering your threshold."
                         "Max of convolved model is %3.2e" % model.max())
    fitcube = model[:, maskindices[:, 0], maskindices[:, 1]].T

    # set weights for fit
    if rms_cube is not None:
        print("Using RMS in each imaging band to determine weights. \n")
        weights = np.where(rms_cube > 0, 1.0 / rms_cube**2, 0.0)
        # normalise
        weights /= weights.max()
    else:
        if args.channel_weights is not None:
            weights = np.array(args.channel_weights)
            print("Using provided channel weights \n")
        else:
            print(
                "No residual or channel weights provided. Using equal weights. \n"
            )
            weights = np.ones(nband, dtype=np.float64)

    ncomps, _ = fitcube.shape
    fitcube = da.from_array(fitcube.astype(np.float64),
                            chunks=(ncomps // args.ncpu, nband))
    weights = da.from_array(weights.astype(np.float64), chunks=(nband))
    freqsdask = da.from_array(freqs.astype(np.float64), chunks=(nband))

    print("Fitting %i components" % ncomps)
    alpha, alpha_err, Iref, i0_err = fit_spi_components(
        fitcube, weights, freqsdask, np.float64(ref_freq)).compute()
    print("Done. Writing output. \n")

    alphamap = np.zeros(model[0].shape, dtype=model.dtype)
    alpha_err_map = np.zeros(model[0].shape, dtype=model.dtype)
    i0map = np.zeros(model[0].shape, dtype=model.dtype)
    i0_err_map = np.zeros(model[0].shape, dtype=model.dtype)
    alphamap[maskindices[:, 0], maskindices[:, 1]] = alpha
    alpha_err_map[maskindices[:, 0], maskindices[:, 1]] = alpha_err
    i0map[maskindices[:, 0], maskindices[:, 1]] = Iref
    i0_err_map[maskindices[:, 0], maskindices[:, 1]] = i0_err

    if 'I' in args.products:
        # get the reconstructed cube
        Irec_cube = i0map[None, :, :] * \
            (freqs[:, None, None]/ref_freq)**alphamap[None, :, :]
        name = outfile + '.Irec_cube.fits'
        save_fits(name,
                  Irec_cube.reshape(orig_shape),
                  mhdr,
                  dtype=args.out_dtype)
        print("Wrote reconstructed cube to %s" % name)

    # save alpha map
    if 'a' in args.products:
        name = outfile + '.alpha.fits'
        save_fits(name,
                  alphamap.reshape(mfs_shape),
                  mhdr,
                  dtype=args.out_dtype)
        print("Wrote alpha map to %s" % name)

    # save alpha error map
    if 'e' in args.products:
        name = outfile + '.alpha_err.fits'
        save_fits(name,
                  alpha_err_map.reshape(mfs_shape),
                  mhdr,
                  dtype=args.out_dtype)
        print("Wrote alpha error map to %s" % name)

    # save I0 map
    if 'i' in args.products:
        name = outfile + '.I0.fits'
        save_fits(name, i0map.reshape(mfs_shape), mhdr, dtype=args.out_dtype)
        print("Wrote I0 map to %s" % name)

    # save I0 error map
    if 'k' in args.products:
        name = outfile + '.I0_err.fits'
        save_fits(name,
                  i0_err_map.reshape(mfs_shape),
                  mhdr,
                  dtype=args.out_dtype)
        print("Wrote I0 error map to %s" % name)

    print(' \n ')

    print("All done here")
    assert hasattr(gs, "dask_graph_")

    with tmpdir() as d:
        gs.visualize(filename=os.path.join(d, "mydask"))
        assert os.path.exists(os.path.join(d, "mydask.png"))

    # Doesn't work if not fitted
    gs = dcv.GridSearchCV(clf, grid)
    with pytest.raises(NotFittedError):
        gs.visualize()


np_X = np.random.normal(size=(20, 3))
np_y = np.random.randint(2, size=20)
np_groups = np.random.permutation(list(range(5)) * 4)
da_X = da.from_array(np_X, chunks=(3, 3))
da_y = da.from_array(np_y, chunks=3)
da_groups = da.from_array(np_groups, chunks=3)
del_X = delayed(np_X)
del_y = delayed(np_y)
del_groups = delayed(np_groups)


@pytest.mark.parametrize(
    ["cls", "has_shuffle"],
    [
        (KFold, True),
        (GroupKFold, False),
        (StratifiedKFold, True),
        (TimeSeriesSplit, False),
    ],
Exemple #56
0
def test_multiple_list_slicing():
    x = np.random.rand(6, 7, 8)
    a = da.from_array(x, chunks=(3, 3, 3))
    assert_eq(x[:, [0, 1, 2]][[0, 1]], a[:, [0, 1, 2]][[0, 1]])
Exemple #57
0
def test_confusion_matrix_binary(client, chunks):
    y_true = da.from_array(cp.array([0, 1, 0, 1]), chunks=chunks)
    y_pred = da.from_array(cp.array([1, 1, 1, 0]), chunks=chunks)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    ref = cp.array([0, 2, 1, 1])
    cp.testing.assert_array_equal(ref, cp.array([tn, fp, fn, tp]))
Exemple #58
0
def test_confusion_matrix(client, chunks):
    y_true = da.from_array(cp.array([2, 0, 2, 2, 0, 1]), chunks=chunks)
    y_pred = da.from_array(cp.array([0, 0, 2, 2, 0, 2]), chunks=chunks)
    cm = confusion_matrix(y_true, y_pred)
    ref = cp.array([[2, 0, 0], [0, 0, 1], [1, 0, 2]])
    cp.testing.assert_array_equal(cm, ref)
Exemple #59
0
    def read_band(self, key, info):
        """Read the data."""
        tic = datetime.now()
        header = {}
        with open(self.filename, "rb") as fp_:

            header['block1'] = np.fromfile(fp_,
                                           dtype=_BASIC_INFO_TYPE,
                                           count=1)
            header["block2"] = np.fromfile(fp_, dtype=_DATA_INFO_TYPE, count=1)
            header["block3"] = np.fromfile(fp_, dtype=_PROJ_INFO_TYPE, count=1)
            header["block4"] = np.fromfile(fp_, dtype=_NAV_INFO_TYPE, count=1)
            header["block5"] = np.fromfile(fp_, dtype=_CAL_INFO_TYPE, count=1)
            logger.debug("Band number = " +
                         str(header["block5"]['band_number'][0]))
            logger.debug('Time_interval: %s - %s', str(self.start_time),
                         str(self.end_time))
            band_number = header["block5"]['band_number'][0]
            if band_number < 7:
                cal = np.fromfile(fp_, dtype=_VISCAL_INFO_TYPE, count=1)
            else:
                cal = np.fromfile(fp_, dtype=_IRCAL_INFO_TYPE, count=1)

            header['calibration'] = cal

            header["block6"] = np.fromfile(fp_,
                                           dtype=_INTER_CALIBRATION_INFO_TYPE,
                                           count=1)
            header["block7"] = np.fromfile(fp_,
                                           dtype=_SEGMENT_INFO_TYPE,
                                           count=1)
            header["block8"] = np.fromfile(
                fp_, dtype=_NAVIGATION_CORRECTION_INFO_TYPE, count=1)
            # 8 The navigation corrections:
            ncorrs = header["block8"]['numof_correction_info_data'][0]
            dtype = np.dtype([
                ("line_number_after_rotation", "<u2"),
                ("shift_amount_for_column_direction", "f4"),
                ("shift_amount_for_line_direction", "f4"),
            ])
            corrections = []
            for i in range(ncorrs):
                corrections.append(np.fromfile(fp_, dtype=dtype, count=1))
            fp_.seek(40, 1)
            header['navigation_corrections'] = corrections
            header["block9"] = np.fromfile(fp_,
                                           dtype=_OBS_TIME_INFO_TYPE,
                                           count=1)
            numobstimes = header["block9"]['number_of_observation_times'][0]

            dtype = np.dtype([
                ("line_number", "<u2"),
                ("observation_time", "f8"),
            ])
            lines_and_times = []
            for i in range(numobstimes):
                lines_and_times.append(np.fromfile(fp_, dtype=dtype, count=1))
            header['observation_time_information'] = lines_and_times
            fp_.seek(40, 1)

            header["block10"] = np.fromfile(fp_,
                                            dtype=_ERROR_INFO_TYPE,
                                            count=1)
            dtype = np.dtype([
                ("line_number", "<u2"),
                ("numof_error_pixels_per_line", "<u2"),
            ])
            num_err_info_data = header["block10"]['number_of_error_info_data'][
                0]
            err_info_data = []
            for i in range(num_err_info_data):
                err_info_data.append(np.fromfile(fp_, dtype=dtype, count=1))
            header['error_information_data'] = err_info_data
            fp_.seek(40, 1)

            np.fromfile(fp_, dtype=_SPARE_TYPE, count=1)

            nlines = int(header["block2"]['number_of_lines'][0])
            ncols = int(header["block2"]['number_of_columns'][0])

            res = da.from_array(np.memmap(self.filename,
                                          offset=fp_.tell(),
                                          dtype='<u2',
                                          shape=(nlines, ncols),
                                          mode='r'),
                                chunks=CHUNK_SIZE)
        res = da.where(res == 65535, np.float32(np.nan), res)
        self._header = header

        logger.debug("Reading time " + str(datetime.now() - tic))
        res = self.calibrate(res, key.calibration)
        new_info = dict(
            units=info['units'],
            standard_name=info['standard_name'],
            wavelength=info['wavelength'],
            resolution='resolution',
            id=key,
            name=key.name,
            scheduled_time=self.scheduled_time,
            platform_name=self.platform_name,
            sensor=self.sensor,
            satellite_longitude=float(self.nav_info['SSP_longitude']),
            satellite_latitude=float(self.nav_info['SSP_latitude']),
            satellite_altitude=float(
                self.nav_info['distance_earth_center_to_satellite'] -
                self.proj_info['earth_equatorial_radius']) * 1000)
        res = xr.DataArray(res, attrs=new_info, dims=['y', 'x'])
        res = res.where(
            header['block5']["count_value_outside_scan_pixels"][0] != res)
        res = res.where(header['block5']["count_value_error_pixels"][0] != res)
        res = res.where(self.geo_mask())
        return res
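The lazy-read pattern used in read_band (an np.memmap wrapped in a dask array, with fill values masked afterwards) also works in isolation; a self-contained sketch with a temporary file:

import os
import tempfile

import dask.array as da
import numpy as np

path = os.path.join(tempfile.mkdtemp(), 'counts.dat')
np.arange(16, dtype='<u2').tofile(path)                  # fake on-disk count data
mm = np.memmap(path, offset=0, dtype='<u2', shape=(4, 4), mode='r')
counts = da.from_array(mm, chunks=(2, 4))                # lazy view on the file
counts = da.where(counts == 65535, np.float32(np.nan), counts)
print(counts.compute())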
Exemple #60
0
def from_bcolz(x,
               chunksize=None,
               categorize=True,
               index=None,
               lock=lock,
               **kwargs):
    """ Read BColz CTable into a Dask Dataframe

    BColz is a fast on-disk compressed column store with careful attention
    given to compression.  https://bcolz.readthedocs.io/en/latest/

    Parameters
    ----------
    x : bcolz.ctable
    chunksize : int, optional
        The size of blocks to pull out from ctable.
    categorize : bool, defaults to True
        Automatically categorize all string dtypes
    index : string, optional
        Column to make the index
    lock: bool or Lock
        Lock to use when reading or False for no lock (not-thread-safe)

    See Also
    --------
    from_array: more generic function not optimized for bcolz
    """
    if lock is True:
        lock = Lock()

    import dask.array as da
    import bcolz

    if isinstance(x, (str, unicode)):
        x = bcolz.ctable(rootdir=x)
    bc_chunklen = max(x[name].chunklen for name in x.names)
    if chunksize is None and bc_chunklen > 10000:
        chunksize = bc_chunklen

    categories = dict()
    if categorize:
        for name in x.names:
            if (np.issubdtype(x.dtype[name], np.string_)
                    or np.issubdtype(x.dtype[name], np.unicode_)
                    or np.issubdtype(x.dtype[name], np.object_)):
                a = da.from_array(x[name], chunks=(chunksize * len(x.names), ))
                categories[name] = da.unique(a)

    columns = tuple(x.dtype.names)
    divisions = tuple(range(0, len(x), chunksize))
    divisions = divisions + (len(x) - 1, )
    if x.rootdir:
        token = tokenize((x.rootdir, os.path.getmtime(x.rootdir)), chunksize,
                         categorize, index, kwargs)
    else:
        token = tokenize((id(x), x.shape, x.dtype), chunksize, categorize,
                         index, kwargs)
    new_name = 'from_bcolz-' + token

    dsk = dict(((new_name, i), (dataframe_from_ctable, x,
                                (slice(i * chunksize, (i + 1) * chunksize), ),
                                columns, categories, lock))
               for i in range(0, int(ceil(len(x) / chunksize))))

    meta = dataframe_from_ctable(x, slice(0, 0), columns, categories, lock)
    result = DataFrame(dsk, new_name, meta, divisions)

    if index:
        assert index in x.names
        a = da.from_array(x[index], chunks=(chunksize * len(x.names), ))
        q = np.linspace(0, 100, len(x) // chunksize + 2)
        divisions = tuple(da.percentile(a, q).compute())
        return set_partition(result, index, divisions, **kwargs)
    else:
        return result