Example #1
def test_tokenize_numpy_memmap():
    with tmpfile('.npy') as fn:
        x = np.arange(5)
        np.save(fn, x)
        y = tokenize(np.load(fn, mmap_mode='r'))

    with tmpfile('.npy') as fn:
        x = np.arange(5)
        np.save(fn, x)
        z = tokenize(np.load(fn, mmap_mode='r'))

    assert y != z

    with tmpfile('.npy') as fn:
        x = np.random.normal(size=(10, 10))
        np.save(fn, x)
        mm = np.load(fn, mmap_mode='r')
        mm2 = np.load(fn, mmap_mode='r')
        a = tokenize(mm[0, :])
        b = tokenize(mm[1, :])
        c = tokenize(mm[0:3, :])
        d = tokenize(mm[:, 0])
        assert len(set([a, b, c, d])) == 4
        assert tokenize(mm) == tokenize(mm2)
        assert tokenize(mm[1, :]) == tokenize(mm2[1, :])
Example #2
def test_tokenize_pandas_no_pickle():
    class NoPickle(object):
        # pickling not supported because it is a local class
        pass

    df = pd.DataFrame({'x': ['foo', None, NoPickle()]})
    tokenize(df)
Example #3
def test_frompyfunc_wrapper():
    f = da_frompyfunc(add, 2, 1)
    np_f = np.frompyfunc(add, 2, 1)
    x = np.array([1, 2, 3])

    # Callable
    np.testing.assert_equal(f(x, 1), np_f(x, 1))

    # picklable
    f2 = pickle.loads(pickle.dumps(f))
    np.testing.assert_equal(f2(x, 1), np_f(x, 1))

    # Attributes
    assert f.ntypes == np_f.ntypes
    with pytest.raises(AttributeError):
        f.not_an_attribute

    # Tab completion
    assert 'ntypes' in dir(f)

    # Methods
    np.testing.assert_equal(f.outer(x, x), np_f.outer(x, x))

    # funcname
    assert f.__name__ == 'frompyfunc-add'

    # repr
    assert repr(f) == "da.frompyfunc<add, 2, 1>"

    # tokenize
    assert (tokenize(da_frompyfunc(add, 2, 1)) ==
            tokenize(da_frompyfunc(add, 2, 1)))
Example #4
def test_tokenize_numpy_matrix():
    rng = np.random.RandomState(1234)
    a = np.asmatrix(rng.rand(100))
    b = a.copy()
    assert tokenize(a) == tokenize(b)

    b[:10] = 1
    assert tokenize(a) != tokenize(b)
Example #5
def test_tokenize_numpy_ufunc_consistent():
    assert tokenize(np.sin) == '02106e2c67daf452fb480d264e0dac21'
    assert tokenize(np.cos) == 'c99e52e912e4379882a9a4b387957a0b'

    # Make a ufunc that isn't in the numpy namespace. Similar to
    # any found in other packages.
    inc = np.frompyfunc(lambda x: x + 1, 1, 1)
    assert tokenize(inc) == tokenize(inc)
Example #6
def test_tokenize_same_repr():
    class Foo(object):
        def __init__(self, x):
            self.x = x
        def __repr__(self):
            return 'a foo'

    assert tokenize(Foo(1)) != tokenize(Foo(2))
Example #7
def test_tokenize_ordered_dict():
    with ignoring(ImportError):
        from collections import OrderedDict
        a = OrderedDict([('a', 1), ('b', 2)])
        b = OrderedDict([('a', 1), ('b', 2)])
        c = OrderedDict([('b', 2), ('a', 1)])

        assert tokenize(a) == tokenize(b)
        assert tokenize(a) != tokenize(c)
Example #8
def test_tokenize_masked_array():
    m = np.ma.masked_array([1, 2, 3], mask=[True, True, False], fill_value=10)
    m2 = np.ma.masked_array([1, 2, 3], mask=[True, True, False], fill_value=0)
    m3 = np.ma.masked_array([1, 2, 3], mask=False, fill_value=10)
    assert tokenize(m) == tokenize(m)
    assert tokenize(m2) == tokenize(m2)
    assert tokenize(m3) == tokenize(m3)
    assert tokenize(m) != tokenize(m2)
    assert tokenize(m) != tokenize(m3)
Example #9
def test_tokenize_numpy_memmap_no_filename():
    # GH 1562:
    with tmpfile('.npy') as fn1, tmpfile('.npy') as fn2:
        x = np.arange(5)
        np.save(fn1, x)
        np.save(fn2, x)

        a = np.load(fn1, mmap_mode='r')
        b = a + a
        assert tokenize(b) == tokenize(b)
Example #10
def test_predefined_split():
    cv = PredefinedSplit(np.array(list(range(4)) * 5))
    cv2 = PredefinedSplit(np.array(list(range(5)) * 4))
    assert tokenize(cv) == tokenize(cv)
    assert tokenize(cv) != tokenize(cv2)

    sol = cv.get_n_splits(np_X, np_y, np_groups)
    assert compute_n_splits(cv, np_X, np_y, np_groups) == sol

    with assert_dask_compute(False):
        assert compute_n_splits(cv, da_X, da_y, da_groups) == sol
Example #11
def test_old_style_cv():
    cv1 = _CVIterableWrapper([np.array([True, False, True, False] * 5),
                              np.array([False, True, False, True] * 5)])
    cv2 = _CVIterableWrapper([np.array([True, False, True, False] * 5),
                              np.array([False, True, True, True] * 5)])
    assert tokenize(cv1) == tokenize(cv1)
    assert tokenize(cv1) != tokenize(cv2)

    sol = cv1.get_n_splits(np_X, np_y, np_groups)
    assert compute_n_splits(cv1, np_X, np_y, np_groups) == sol
    with assert_dask_compute(False):
        assert compute_n_splits(cv1, da_X, da_y, da_groups) == sol
Example #12
def test_tokenize_numpy_memmap():
    with tmpfile('.npy') as fn:
        x = np.arange(5)
        np.save(fn, x)
        y = tokenize(np.load(fn, mmap_mode='r'))

    with tmpfile('.npy') as fn:
        x = np.arange(5)
        np.save(fn, x)
        z = tokenize(np.load(fn, mmap_mode='r'))

    assert y != z
Example #13
def test_tokenize_entityset():
    es = make_ecommerce_entityset()
    dupe = make_ecommerce_entityset()
    int_es = make_ecommerce_entityset(with_integer_time_index=True)

    # check identical entitysets hash to same token
    assert tokenize(es) == tokenize(dupe)

    # not same if product relationship is missing
    productless = make_ecommerce_entityset()
    productless.relationships.pop()
    assert tokenize(es) != tokenize(productless)

    # not same if integer entityset
    assert tokenize(es) != tokenize(int_es)

    # add row to cohorts
    cohorts_df = dupe['cohorts'].df
    new_row = pd.DataFrame(data={'cohort': [2],
                                 'cohort_name': ['On Time Adopters'],
                                 'cohort_end': [pd.Timestamp('2011-04-08 12:00:00')]},
                           columns=['cohort', 'cohort_name', 'cohort_end'],
                           index=[2])
    more_cohorts = cohorts_df.append(new_row, ignore_index=True, sort=True)
    dupe['cohorts'].update_data(more_cohorts)
    assert tokenize(es) != tokenize(dupe)
Example #14
def _stack(futures, axis=0, executor=None):
    executor = default_executor(executor)
    assert isinstance(futures, (list, tuple))
    assert all(isinstance(f, Future) for f in futures)  # flat list

    shapes = executor.map(lambda x: x.shape, futures[:10])
    dtype = executor.submit(get_dtype, futures[0])

    shapes, dtype = yield executor._gather([shapes, dtype])

    shape = shapes[0]
    assert all(shape == s for s in shapes)

    slc = ((slice(None),) * axis
         + (None,)
         + (slice(None),) * (len(shape) - axis))

    chunks = (tuple((shape[i],) for i in range(axis))
            + ((1,) * len(futures),)
            + tuple((shape[i],) for i in range(axis, len(shape))))

    name = 'stack-futures' + tokenize(*futures)
    keys = list(product([name], *[range(len(c)) for c in chunks]))
    dsk = {k: (getitem, f, slc) for k, f in zip(keys, futures)}

    raise gen.Return(da.Array(dsk, name, chunks, dtype))
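
A note on the slicing trick above, as a NumPy-only sketch: indexing each gathered result with a slice tuple that contains ``None`` inserts a new length-1 axis at position ``axis``, which is why every future contributes one unit chunk along the stacking dimension. The ``futures_results`` list below is a hypothetical stand-in for the gathered future values.

import numpy as np

# Hypothetical stand-ins for the gathered future results.
futures_results = [np.arange(6).reshape(2, 3) for _ in range(4)]
axis = 0
shape = futures_results[0].shape

# Same slice construction as in _stack: None adds a new length-1 axis.
slc = (slice(None),) * axis + (None,) + (slice(None),) * (len(shape) - axis)

slabs = [x[slc] for x in futures_results]    # each becomes shape (1, 2, 3)
stacked = np.concatenate(slabs, axis=axis)   # shape (4, 2, 3)
assert stacked.shape == (4, 2, 3)
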
Example #15
def test_leave_group_out(cvs):
    tokens = []
    for cv in cvs:
        assert tokenize(cv) == tokenize(cv)
        tokens.append(tokenize(cv))
    assert len(set(tokens)) == len(tokens)

    cv = cvs[0]
    sol = cv.get_n_splits(np_X, np_y, np_groups)
    assert compute_n_splits(cv, np_X, np_y, np_groups) == sol

    with assert_dask_compute(True):
        assert compute_n_splits(cv, da_X, da_y, da_groups) == sol

    with assert_dask_compute(False):
        assert compute_n_splits(cv, da_X, da_y, np_groups) == sol
Example #16
def broadcast_to_workers(workers, data, report=False, rpc=rpc):
    """ Broadcast data directly to all workers

    This sends all data to every worker.

    Currently this works inefficiently by sending all data out directly from
    the scheduler.  In the future we should have the workers communicate
    amongst themselves.

    Parameters
    ----------
    workers: sequence of (host, port) pairs
    data: sequence of data

    See Also
    --------
    scatter_to_workers
    """
    if isinstance(data, dict):
        names, data = list(zip(*data.items()))
    else:
        names = []
        for x in data:
            try:
                names.append(tokenize(x))
            except:
                names.append(str(uuid.uuid1()))
        data = dict(zip(names, data))

    out = yield All([rpc(ip=w_ip, port=w_port).update_data(data=data,
                                                           report=report)
                     for (w_ip, w_port) in workers])
    nbytes = merge([o[1]['nbytes'] for o in out])

    raise Return((names, nbytes))
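
The part of this coroutine that touches tokenize is the naming fallback: every data item gets a deterministic key from tokenize when possible, and a random uuid otherwise. A minimal standalone sketch of that fallback, assuming only dask is installed (``name_for`` is a hypothetical helper, not part of distributed):

import uuid
from dask.base import tokenize

def name_for(x):
    """Deterministic name when tokenize works, random uuid otherwise."""
    try:
        return tokenize(x)
    except Exception:
        return str(uuid.uuid1())

assert name_for([1, 2, 3]) == name_for([1, 2, 3])  # deterministic for plain data
assert len(name_for(object())) > 0                 # always yields some name
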
Example #17
File: api.py Project: OXPHOS/xarray
    def maybe_decode_store(store, lock=False):
        ds = conventions.decode_cf(
            store, mask_and_scale=mask_and_scale, decode_times=decode_times,
            concat_characters=concat_characters, decode_coords=decode_coords,
            drop_variables=drop_variables)

        if chunks is not None:
            try:
                from dask.base import tokenize
            except ImportError:
                import dask  # raise the usual error if dask is entirely missing
                if dask.__version__ < '0.6':
                    raise ImportError('xarray requires dask version 0.6 or newer')
                else:
                    raise

            if (isinstance(filename_or_obj, basestring) and
                    not is_remote_uri(filename_or_obj)):
                file_arg = os.path.getmtime(filename_or_obj)
            else:
                file_arg = filename_or_obj
            token = tokenize(file_arg, group, decode_cf, mask_and_scale,
                             decode_times, concat_characters, decode_coords,
                             engine, chunks, drop_variables)
            name_prefix = '%s:%s/' % (filename_or_obj, group or '')
            ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token,
                           lock=lock)
            ds2._file_obj = ds._file_obj
        else:
            ds2 = ds

        return ds2
Example #18
def line(glyph, df, schema, canvas, summary):
    shape, bounds, st, axis = shape_bounds_st_and_axis(df, canvas, glyph)

    # Compile functions
    create, info, append, combine, finalize = compile_components(summary,
                                                                 schema)
    x_mapper = canvas.x_axis.mapper
    y_mapper = canvas.y_axis.mapper
    extend = glyph._build_extend(x_mapper, y_mapper, info, append)

    def chunk(df, df2=None):
        plot_start = True
        if df2 is not None:
            df = pd.concat([df.iloc[-1:], df2])
            plot_start = False
        aggs = create(shape)
        extend(aggs, df, st, bounds, plot_start=plot_start)
        return aggs

    name = tokenize(df._name, canvas, glyph, summary)
    old_name = df._name
    dsk = {(name, 0): (chunk, (old_name, 0))}
    for i in range(1, df.npartitions):
        dsk[(name, i)] = (chunk, (old_name, i - 1), (old_name, i))
    keys2 = [(name, i) for i in range(df.npartitions)]
    dsk[name] = (apply, finalize, [(combine, keys2)],
                 dict(coords=axis, dims=['y_axis', 'x_axis']))
    return dsk, name
Example #19
def dask_pipeline(df, schema, canvas, glyph, summary):
    create, info, append, combine, finalize = compile_components(summary,
                                                                 schema)
    extend = glyph._build_extend(info, append)

    x_range = canvas.x_range or compute_x_bounds(glyph, df)
    y_range = canvas.y_range or compute_y_bounds(glyph, df)
    x_min, x_max, y_min, y_max = compute(*(x_range + y_range))
    x_range, y_range = (x_min, x_max), (y_min, y_max)
    df = subselect(glyph, df, canvas)

    vt = canvas.view_transform(x_range, y_range)
    shape = (canvas.plot_height, canvas.plot_width)

    def chunk(df):
        aggs = create(shape)
        extend(aggs, df, vt)
        return aggs

    name = tokenize(df._name, canvas, glyph, summary)
    keys = df._keys()
    keys2 = [(name, i) for i in range(len(keys))]
    dsk = dict((k2, (chunk, k)) for (k2, k) in zip(keys2, keys))
    dsk[name] = (finalize, (combine, keys2))
    dsk.update(df.dask)
    dsk = df._optimize(dsk, name)

    get = _globals['get'] or df._default_get

    return get(dsk, name)
Example #20
    def score(self, X, y):
        self.predict(X)
        names = self.get_predict_keys(X)
        name = ('score', tokenize(names[-1], y))
        dsk = {name: (accuracy_score, names[-1], y)}

        return Value(name, [dsk, self.dask.copy()])
Example #21
    def maybe_decode_store(store, lock=False):
        ds = conventions.decode_cf(
            store, mask_and_scale=mask_and_scale, decode_times=decode_times,
            concat_characters=concat_characters, decode_coords=decode_coords,
            drop_variables=drop_variables)

        _protect_dataset_variables_inplace(ds, cache)

        if chunks is not None:
            from dask.base import tokenize
            # if passed an actual file path, augment the token with
            # the file modification time
            if (isinstance(filename_or_obj, basestring) and
                    not is_remote_uri(filename_or_obj)):
                mtime = os.path.getmtime(filename_or_obj)
            else:
                mtime = None
            token = tokenize(filename_or_obj, mtime, group, decode_cf,
                             mask_and_scale, decode_times, concat_characters,
                             decode_coords, engine, chunks, drop_variables)
            name_prefix = 'open_dataset-%s' % token
            ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token,
                           lock=lock)
            ds2._file_obj = ds._file_obj
        else:
            ds2 = ds

        # protect so that dataset store isn't necessarily closed, e.g.,
        # streams like BytesIO  can't be reopened
        # datastore backend is responsible for determining this capability
        if store._autoclose:
            store.close()

        return ds2
Example #22
    def _get_solar_flux_old(self, band):
        # TODO: this could be replaced with vectorized indexing in the future.
        from dask.base import tokenize
        blocksize = CHUNK_SIZE

        solar_flux = self.cal['solar_flux'].isel(bands=band).values
        d_index = self.cal['detector_index'].fillna(0).astype(int)

        shape = d_index.shape
        vchunks = range(0, shape[0], blocksize)
        hchunks = range(0, shape[1], blocksize)

        token = tokenize(band, d_index, solar_flux)
        name = 'solar_flux_' + token

        def get_items(array, slices):
            return solar_flux[d_index[slices].values]

        dsk = {(name, i, j): (get_items,
                              d_index,
                              (slice(vcs, min(vcs + blocksize, shape[0])),
                               slice(hcs, min(hcs + blocksize, shape[1]))))
               for i, vcs in enumerate(vchunks)
               for j, hcs in enumerate(hchunks)
               }

        res = da.Array(dsk, name, shape=shape,
                       chunks=(blocksize, blocksize),
                       dtype=solar_flux.dtype)
        return res
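
Conceptually, the hand-built graph above chunks up a single fancy-indexing lookup: each task computes ``solar_flux[d_index[slices]]`` for one block. A tiny NumPy-only illustration of that per-block lookup, with hypothetical small arrays in place of the calibration data:

import numpy as np

solar_flux = np.array([10.0, 20.0, 30.0])   # one flux value per detector
d_index = np.array([[0, 2], [1, 1]])        # detector index for each pixel

# Whole-array version of what each (get_items, ...) task does per block:
per_pixel_flux = solar_flux[d_index]
assert per_pixel_flux.tolist() == [[10.0, 30.0], [20.0, 20.0]]
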
Example #23
    def submit(self, func, *args, **kwargs):
        """ Submit a function application to the scheduler

        Parameters
        ----------
        func: callable
        *args:
        **kwargs:
        pure: bool (defaults to True)
            Whether or not the function is pure.  Set ``pure=False`` for
            impure functions like ``np.random.random``.
        workers: set, iterable of sets
            A set of worker hostnames on which computations may be performed.
            Leave empty to default to all workers (common case)

        Examples
        --------
        >>> c = executor.submit(add, a, b)  # doctest: +SKIP

        Returns
        -------
        Future

        See Also
        --------
        distributed.executor.Executor.submit:
        """
        if not callable(func):
            raise TypeError("First input to submit must be a callable function")

        key = kwargs.pop('key', None)
        pure = kwargs.pop('pure', True)
        workers = kwargs.pop('workers', None)

        if key is None:
            if pure:
                key = funcname(func) + '-' + tokenize(func, kwargs, *args)
            else:
                key = funcname(func) + '-' + next(tokens)

        if key in self.futures:
            return Future(key, self)

        if kwargs:
            task = (apply, func, args, kwargs)
        else:
            task = (func,) + args

        if workers is not None:
            restrictions = {key: workers}
        else:
            restrictions = {}

        logger.debug("Submit %s(...), %s", funcname(func), key)
        self.send_to_scheduler({'op': 'update-graph',
                                'dsk': {key: task},
                                'keys': [key],
                                'restrictions': restrictions})

        return Future(key, self)
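
The key construction is where tokenize comes in: for ``pure=True`` the key is derived deterministically from the function and its arguments, so repeated submissions of the same call map to the same future, while ``pure=False`` draws a fresh token each time. A rough sketch of just that naming behaviour, assuming dask is available; the ``tokens`` generator here is a hypothetical stand-in for the module-level iterator used above:

from itertools import count
from dask.base import tokenize
from dask.utils import funcname

tokens = ("token-%d" % i for i in count(1))  # hypothetical impure-key source

def make_key(func, args, kwargs, pure=True):
    if pure:
        return funcname(func) + '-' + tokenize(func, kwargs, *args)
    return funcname(func) + '-' + next(tokens)

def add(x, y):
    return x + y

# Pure calls with identical arguments collapse onto the same key...
assert make_key(add, (1, 2), {}) == make_key(add, (1, 2), {})
# ...while impure calls always receive a fresh key.
assert make_key(add, (1, 2), {}, pure=False) != make_key(add, (1, 2), {}, pure=False)
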
Example #24
def build_graph(estimator, cv, scorer, candidate_params, X, y=None,
                groups=None, fit_params=None, iid=True, refit=True,
                error_score='raise', return_train_score=True, cache_cv=True):

    X, y, groups = to_indexable(X, y, groups)
    cv = check_cv(cv, y, is_classifier(estimator))
    # "pairwise" estimators require a different graph for CV splitting
    is_pairwise = getattr(estimator, '_pairwise', False)

    dsk = {}
    X_name, y_name, groups_name = to_keys(dsk, X, y, groups)
    n_splits = compute_n_splits(cv, X, y, groups)

    if fit_params:
        # A mapping of {name: (name, graph-key)}
        param_values = to_indexable(*fit_params.values(), allow_scalars=True)
        fit_params = {k: (k, v) for (k, v) in
                      zip(fit_params, to_keys(dsk, *param_values))}
    else:
        fit_params = {}

    fields, tokens, params = normalize_params(candidate_params)
    main_token = tokenize(normalize_estimator(estimator), fields, params,
                          X_name, y_name, groups_name, fit_params, cv,
                          error_score == 'raise', return_train_score)

    cv_name = 'cv-split-' + main_token
    dsk[cv_name] = (cv_split, cv, X_name, y_name, groups_name,
                    is_pairwise, cache_cv)

    if iid:
        weights = 'cv-n-samples-' + main_token
        dsk[weights] = (cv_n_samples, cv_name)
    else:
        weights = None

    scores = do_fit_and_score(dsk, main_token, estimator, cv_name, fields,
                              tokens, params, X_name, y_name, fit_params,
                              n_splits, error_score, scorer,
                              return_train_score)

    cv_results = 'cv-results-' + main_token
    candidate_params_name = 'cv-parameters-' + main_token
    dsk[candidate_params_name] = (decompress_params, fields, params)
    dsk[cv_results] = (create_cv_results, scores, candidate_params_name,
                       n_splits, error_score, weights)
    keys = [cv_results]

    if refit:
        best_params = 'best-params-' + main_token
        dsk[best_params] = (get_best_params, candidate_params_name, cv_results)
        best_estimator = 'best-estimator-' + main_token
        if fit_params:
            fit_params = (dict, (zip, list(fit_params.keys()),
                                list(pluck(1, fit_params.values()))))
        dsk[best_estimator] = (fit_best, clone(estimator), best_params,
                               X_name, y_name, fit_params)
        keys.append(best_estimator)

    return dsk, keys, n_splits
Example #25
    def maybe_decode_store(store, lock=False):
        ds = conventions.decode_cf(
            store, mask_and_scale=mask_and_scale, decode_times=decode_times,
            concat_characters=concat_characters, decode_coords=decode_coords,
            drop_variables=drop_variables)

        _protect_dataset_variables_inplace(ds, cache)

        if chunks is not None:
            from dask.base import tokenize
            # if passed an actual file path, augment the token with
            # the file modification time
            if (isinstance(filename_or_obj, basestring) and
                    not is_remote_uri(filename_or_obj)):
                mtime = os.path.getmtime(filename_or_obj)
            else:
                mtime = None
            token = tokenize(filename_or_obj, mtime, group, decode_cf,
                             mask_and_scale, decode_times, concat_characters,
                             decode_coords, engine, chunks, drop_variables)
            name_prefix = 'open_dataset-%s' % token
            ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token)
            ds2._file_obj = ds._file_obj
        else:
            ds2 = ds

        return ds2
Example #26
def test_tokenize_sequences():
    assert tokenize([1]) != tokenize([2])
    assert tokenize([1]) != tokenize((1,))
    assert tokenize([1]) == tokenize([1])

    x = np.arange(2000)  # long enough to drop information in repr
    y = np.arange(2000)
    y[1000] = 0  # middle isn't printed in repr
    assert tokenize([x]) != tokenize([y])
Example #27
def test_tokenize_numpy_array_on_object_dtype():
    assert (tokenize(np.array(['a', 'aa', 'aaa'], dtype=object)) ==
            tokenize(np.array(['a', 'aa', 'aaa'], dtype=object)))
    assert (tokenize(np.array(['a', None, 'aaa'], dtype=object)) ==
            tokenize(np.array(['a', None, 'aaa'], dtype=object)))
    assert (tokenize(np.array([(1, 'a'), (1, None), (1, 'aaa')], dtype=object)) ==
            tokenize(np.array([(1, 'a'), (1, None), (1, 'aaa')], dtype=object)))
    if sys.version_info[0] == 2:
        assert (tokenize(np.array([unicode("Rebeca Alón", encoding="utf-8")], dtype=object)) ==
                tokenize(np.array([unicode("Rebeca Alón", encoding="utf-8")], dtype=object)))
Example #28
    def _scatter(self, data, workers=None, broadcast=False):
        """ Scatter data to local data dictionary

        Rather than send data out to the cluster we keep data local.  However
        we do report to the scheduler that the local worker has the scattered
        data.  This allows other workers to come by and steal this data if
        desired.

        Keywords like ``broadcast=`` do not work; however, operations like
        ``.replicate`` work fine after calling scatter, which can fill in for
        this functionality.
        """
        with log_errors():
            if not (workers is None and broadcast is False):
                raise NotImplementedError("Scatter from worker doesn't support workers or broadcast keywords")

            if isinstance(data, dict) and not all(isinstance(k, (bytes, str))
                                                   for k in data):
                d = yield self._scatter(keymap(tokey, data), workers, broadcast)
                raise gen.Return({k: d[tokey(k)] for k in data})

            if isinstance(data, (list, tuple, set, frozenset)):
                keys = []
                for x in data:
                    try:
                        keys.append(tokenize(x))
                    except:
                        keys.append(str(uuid.uuid1()))
                data2 = dict(zip(keys, data))
            elif isinstance(data, dict):
                keys = set(data)
                data2 = data
            else:
                raise TypeError("Don't know how to scatter %s" % type(data))

            nbytes = valmap(sizeof, data2)

            # self.worker.data.update(data2)  # thread safety matters
            self.worker.loop.add_callback(self.worker.data.update, data2)

            yield self.scheduler.update_data(
                    who_has={key: [self.worker.address] for key in data2},
                    nbytes=valmap(sizeof, data2),
                    client=self.id)

            if isinstance(data, dict):
                out = {k: Future(k, self) for k in data}
            elif isinstance(data, (tuple, list, set, frozenset)):
                out = type(data)([Future(k, self) for k in keys])
            else:
                raise TypeError(
                        "Input to scatter must be a list or dict")

            for key in keys:
                self.futures[key]['status'] = 'finished'
                self.futures[key]['event'].set()

            raise gen.Return(out)
Example #29
    def compute(self, *args, **kwargs):
        """ Compute dask collections on cluster

        Parameters
        ----------
        args: iterable of dask objects
            Collections like dask.array or dataframe or dask.value objects
        sync: bool (optional)
            Returns Futures if False (default) or concrete values if True

        Returns
        -------
        Tuple of Futures or concrete values

        Examples
        --------

        >>> from dask import do, value
        >>> from operator import add
        >>> x = dask.do(add)(1, 2)
        >>> y = dask.do(add)(x, x)
        >>> xx, yy = executor.compute(x, y)  # doctest: +SKIP
        >>> xx  # doctest: +SKIP
        <Future: status: finished, key: add-8f6e709446674bad78ea8aeecfee188e>
        >>> xx.result()  # doctest: +SKIP
        3
        >>> yy.result()  # doctest: +SKIP
        6
        """
        sync = kwargs.pop('sync', False)
        assert not kwargs
        if sync:
            return dask.compute(*args, get=self.get)

        variables = [a for a in args if isinstance(a, Base)]

        groups = groupby(lambda x: x._optimize, variables)
        dsk = merge([opt(merge([v.dask for v in val]),
                         [v._keys() for v in val])
                    for opt, val in groups.items()])
        names = ['finalize-%s' % tokenize(v) for v in variables]
        dsk2 = {name: (v._finalize, v, v._keys()) for name, v in zip(names, variables)}

        self.loop.add_callback(self.scheduler_queue.put_nowait,
                                {'op': 'update-graph',
                                'dsk': merge(dsk, dsk2),
                                'keys': names})

        i = 0
        futures = []
        for arg in args:
            if isinstance(arg, Base):
                futures.append(Future(names[i], self))
                i += 1
            else:
                futures.append(arg)

        return futures
Example #30
def _futures_to_dask_bag(futures, executor=None):
    executor = default_executor(executor)

    name = 'bag-from-futures-' + tokenize(*futures)
    dsk = {(name, i): future for i, future in enumerate(futures)}

    ensure_default_get(executor)

    raise gen.Return(db.Bag(dsk, name, len(futures)))
Example #31
def test_tokenize_pandas_invalid_unicode():
    # see https://github.com/dask/dask/issues/2713
    df = pd.DataFrame(
        {"x\ud83d": [1, 2, 3], "y\ud83d": ["4", "asd\ud83d", None]}, index=[1, 2, 3]
    )
    tokenize(df)
Example #32
def test_tokenize_set():
    assert tokenize({1, 2, "x", (1, "x")}) == tokenize({1, 2, "x", (1, "x")})
Example #33
def test_tokenize_dict():
    assert tokenize({"x": 1, 1: "x"}) == tokenize({"x": 1, 1: "x"})
Example #34
    def _construct_collection_plan(cls, dataset_info):

        # Collect necessary information from dataset_info
        fs = dataset_info["fs"]
        parts = dataset_info["parts"]
        paths = dataset_info["paths"]
        filters = dataset_info["filters"]
        pf = dataset_info["pf"]
        split_row_groups = dataset_info["split_row_groups"]
        chunksize = dataset_info["chunksize"]
        gather_statistics = dataset_info["gather_statistics"]
        base_path = dataset_info["base"]
        aggregation_depth = dataset_info["aggregation_depth"]
        index_cols = dataset_info["index_cols"]
        categories = dataset_info["categories"]
        dtypes = dataset_info["dtypes"]
        categories_dict = dataset_info["categories_dict"]
        has_metadata_file = dataset_info["has_metadata_file"]
        metadata_task_size = dataset_info["metadata_task_size"]
        kwargs = dataset_info["kwargs"]

        # Ensure metadata_task_size is set
        # (Using config file or defaults)
        metadata_task_size = _set_metadata_task_size(
            dataset_info["metadata_task_size"], fs)

        # Determine which columns need statistics.
        # At this point, gather_statistics is only True if
        # the user specified calculate_divisions=True
        filter_columns = {t[0] for t in flatten(filters or [], container=list)}
        stat_col_indices = {}
        _index_cols = index_cols if (gather_statistics
                                     and len(index_cols) == 1) else []
        for i, name in enumerate(pf.columns):
            if name in _index_cols or name in filter_columns:
                stat_col_indices[name] = i

        # Decide final `gather_statistics` setting.
        # NOTE: The "fastparquet" engine requires statistics for
        # filtering even if the filter is on a partitioned column
        gather_statistics = _set_gather_statistics(
            gather_statistics,
            chunksize,
            split_row_groups,
            aggregation_depth,
            filter_columns,
            set(stat_col_indices) | filter_columns,
        )

        # Define common_kwargs
        common_kwargs = {
            "categories": categories_dict or categories,
            "root_cats": pf.cats,
            "root_file_scheme": pf.file_scheme,
            "base_path": base_path,
            **kwargs,
        }

        # Check if this is a very simple case where we can just
        # return the path names. This requires that `parts`
        # already be a list of paths. Also, we cannot be splitting
        # by row-group or collecting statistics.
        if (gather_statistics is False and not split_row_groups
                and isinstance(parts, list) and len(parts)
                and isinstance(parts[0], str)):
            return (
                [{
                    "piece": (full_path, None)
                } for full_path in parts],
                [],
                common_kwargs,
            )

        dataset_info_kwargs = {
            "fs": fs,
            "split_row_groups": split_row_groups,
            "gather_statistics": gather_statistics,
            "filters": filters,
            "dtypes": dtypes,
            "stat_col_indices": stat_col_indices,
            "aggregation_depth": aggregation_depth,
            "chunksize": chunksize,
            "root_cats": pf.cats,
            "root_file_scheme": pf.file_scheme,
            "base_path": "" if base_path is None else base_path,
            "has_metadata_file": has_metadata_file,
        }

        if (has_metadata_file or metadata_task_size == 0
                or metadata_task_size > len(paths)):
            # Construct the output-partitioning plan on the
            # client process (in serial).  This means we have
            # a global _metadata file, or that `metadata_task_size`
            # is zero or larger than the number of files.
            pf_or_paths = pf if has_metadata_file else paths
            parts, stats = cls._collect_file_parts(pf_or_paths,
                                                   dataset_info_kwargs)

        else:
            # We DON'T have a global _metadata file to work with.
            # We should loop over files in parallel
            parts, stats = [], []
            if paths:
                # Build and compute a task graph to construct stats/parts
                gather_parts_dsk = {}
                name = "gather-pq-parts-" + tokenize(paths,
                                                     dataset_info_kwargs)
                finalize_list = []
                for task_i, file_i in enumerate(
                        range(0, len(paths), metadata_task_size)):
                    finalize_list.append((name, task_i))
                    gather_parts_dsk[finalize_list[-1]] = (
                        cls._collect_file_parts,
                        paths[file_i:file_i + metadata_task_size],
                        dataset_info_kwargs,
                    )

                def _combine_parts(parts_and_stats):
                    parts, stats = [], []
                    for part, stat in parts_and_stats:
                        parts += part
                        if stat:
                            stats += stat
                    return parts, stats

                gather_parts_dsk["final-" + name] = (_combine_parts,
                                                     finalize_list)
                parts, stats = Delayed("final-" + name,
                                       gather_parts_dsk).compute()

        return parts, stats, common_kwargs
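
When there is no global _metadata file, the paths are batched into metadata-gathering tasks of at most ``metadata_task_size`` files each. A minimal sketch of that batching with hypothetical inputs (plain Python, no Dask graph):

# Hypothetical file list and task size.
paths = ["part.%d.parquet" % i for i in range(10)]
metadata_task_size = 4

batches = [
    paths[file_i:file_i + metadata_task_size]
    for file_i in range(0, len(paths), metadata_task_size)
]
# Three tasks of 4 + 4 + 2 files; each batch would become one
# cls._collect_file_parts call in the graph built above.
assert [len(b) for b in batches] == [4, 4, 2]
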
Example #35
def test_tokenize_pandas_mixed_unicode_bytes():
    df = pd.DataFrame(
        {u"ö".encode("utf8"): [1, 2, 3], u"ö": [u"ö", u"ö".encode("utf8"), None]},
        index=[1, 2, 3],
    )
    tokenize(df)
Example #36
    def ukey(self, path):
        adl_path = self._trim_filename(path)
        return tokenize(self.info(adl_path)['modificationTime'])
Example #37
def test_tokenize_object_array_with_nans():
    a = np.array(["foo", "Jos\xe9", np.nan], dtype="O")
    assert tokenize(a) == tokenize(a)
Example #38
def test_tokenize_numpy_scalar():
    assert tokenize(np.array(1.0, dtype="f8")) == tokenize(np.array(1.0, dtype="f8"))
    assert tokenize(
        np.array([(1, 2)], dtype=[("a", "i4"), ("b", "i8")])[0]
    ) == tokenize(np.array([(1, 2)], dtype=[("a", "i4"), ("b", "i8")])[0])
Example #39
def test_tokenize():
    a = (1, 2, 3)
    assert isinstance(tokenize(a), (str, bytes))
Example #40
def test_tokenize_literal():
    assert tokenize(literal(["x", 1])) == tokenize(literal(["x", 1]))
Example #41
def test_tokenize_pandas_index():
    idx = pd.Index(["a", "b"])
    assert tokenize(idx) == tokenize(idx)

    idx = pd.MultiIndex.from_product([["a", "b"], [0, 1]])
    assert tokenize(idx) == tokenize(idx)
Example #42
def open_rasterio(
    filename,
    parse_coordinates=None,
    chunks=None,
    cache=None,
    lock=None,
    **kwargs,
):
    """Open a file with rasterio (experimental).

    This should work with any file that rasterio can open (most often:
    geoTIFF). The x and y coordinates are generated automatically from the
    file's geoinformation, shifted to the center of each pixel (see
    `"PixelIsArea" Raster Space
    <http://web.archive.org/web/20160326194152/http://remotesensing.org/geotiff/spec/geotiff2.5.html#2.5.2>`_
    for more information).

    You can generate 2D coordinates from the file's attributes with::

        >>> from affine import Affine
        >>> da = xr.open_rasterio(
        ...     "https://github.com/mapbox/rasterio/raw/1.2.1/tests/data/RGB.byte.tif"
        ... )
        >>> da
        <xarray.DataArray (band: 3, y: 718, x: 791)>
        [1703814 values with dtype=uint8]
        Coordinates:
          * band     (band) int64 1 2 3
          * y        (y) float64 2.827e+06 2.826e+06 2.826e+06 ... 2.612e+06 2.612e+06
          * x        (x) float64 1.021e+05 1.024e+05 1.027e+05 ... 3.389e+05 3.392e+05
        Attributes:
            transform:      (300.0379266750948, 0.0, 101985.0, 0.0, -300.041782729805...
            crs:            +init=epsg:32618
            res:            (300.0379266750948, 300.041782729805)
            is_tiled:       0
            nodatavals:     (0.0, 0.0, 0.0)
            scales:         (1.0, 1.0, 1.0)
            offsets:        (0.0, 0.0, 0.0)
            AREA_OR_POINT:  Area
        >>> transform = Affine(*da.attrs["transform"])
        >>> transform
        Affine(300.0379266750948, 0.0, 101985.0,
               0.0, -300.041782729805, 2826915.0)
        >>> nx, ny = da.sizes["x"], da.sizes["y"]
        >>> x, y = transform * np.meshgrid(np.arange(nx) + 0.5, np.arange(ny) + 0.5)
        >>> x
        array([[102135.01896334, 102435.05689001, 102735.09481669, ...,
                338564.90518331, 338864.94310999, 339164.98103666],
               [102135.01896334, 102435.05689001, 102735.09481669, ...,
                338564.90518331, 338864.94310999, 339164.98103666],
               [102135.01896334, 102435.05689001, 102735.09481669, ...,
                338564.90518331, 338864.94310999, 339164.98103666],
               ...,
               [102135.01896334, 102435.05689001, 102735.09481669, ...,
                338564.90518331, 338864.94310999, 339164.98103666],
               [102135.01896334, 102435.05689001, 102735.09481669, ...,
                338564.90518331, 338864.94310999, 339164.98103666],
               [102135.01896334, 102435.05689001, 102735.09481669, ...,
                338564.90518331, 338864.94310999, 339164.98103666]])

    Parameters
    ----------
    filename : str, rasterio.DatasetReader, or rasterio.WarpedVRT
        Path to the file to open. Or already open rasterio dataset.
    parse_coordinates : bool, optional
        Whether to parse the x and y coordinates out of the file's
        ``transform`` attribute or not. The default is to automatically
        parse the coordinates only if they are rectilinear (1D).
        It can be useful to set ``parse_coordinates=False``
        if your files are very large or if you don't need the coordinates.
    chunks : int, tuple or dict, optional
        Chunk sizes along each dimension, e.g., ``5``, ``(5, 5)`` or
        ``{'x': 5, 'y': 5}``. If chunks is provided, it is used to load the new
        DataArray into a dask array.
    cache : bool, optional
        If True, cache data loaded from the underlying datastore in memory as
        NumPy arrays when accessed to avoid reading from the underlying data-
        store multiple times. Defaults to True unless you specify the `chunks`
        argument to use dask, in which case it defaults to False.
    lock : False, True or threading.Lock, optional
        If chunks is provided, this argument is passed on to
        :py:func:`dask.array.from_array`. By default, a global lock is
        used to avoid issues with concurrent access to the same file when using
        dask's multithreaded backend.

    Returns
    -------
    data : DataArray
        The newly created DataArray.
    """
    import rasterio
    from rasterio.vrt import WarpedVRT

    vrt_params = None
    if isinstance(filename, rasterio.io.DatasetReader):
        filename = filename.name
    elif isinstance(filename, rasterio.vrt.WarpedVRT):
        vrt = filename
        filename = vrt.src_dataset.name
        vrt_params = dict(
            src_crs=vrt.src_crs.to_string(),
            crs=vrt.crs.to_string(),
            resampling=vrt.resampling,
            tolerance=vrt.tolerance,
            src_nodata=vrt.src_nodata,
            nodata=vrt.nodata,
            width=vrt.width,
            height=vrt.height,
            src_transform=vrt.src_transform,
            transform=vrt.transform,
            dtype=vrt.working_dtype,
            warp_extras=vrt.warp_extras,
        )

    if lock is None:
        lock = RASTERIO_LOCK

    manager = CachingFileManager(
        rasterio.open,
        filename,
        lock=lock,
        mode="r",
        kwargs=kwargs,
    )
    riods = manager.acquire()
    if vrt_params is not None:
        riods = WarpedVRT(riods, **vrt_params)

    if cache is None:
        cache = chunks is None

    coords = {}

    # Get bands
    if riods.count < 1:
        raise ValueError("Unknown dims")
    coords["band"] = np.asarray(riods.indexes)

    # Get coordinates
    if riods.transform.is_rectilinear:
        # 1d coordinates
        parse = True if parse_coordinates is None else parse_coordinates
        if parse:
            nx, ny = riods.width, riods.height
            # xarray coordinates are pixel centered
            x, _ = riods.transform * (np.arange(nx) + 0.5, np.zeros(nx) + 0.5)
            _, y = riods.transform * (np.zeros(ny) + 0.5, np.arange(ny) + 0.5)
            coords["y"] = y
            coords["x"] = x
    else:
        # 2d coordinates
        parse = False if (parse_coordinates is None) else parse_coordinates
        if parse:
            warnings.warn(
                "The file coordinates' transformation isn't "
                "rectilinear: xarray won't parse the coordinates "
                "in this case. Set `parse_coordinates=False` to "
                "suppress this warning.",
                RuntimeWarning,
                stacklevel=3,
            )

    # Attributes
    attrs = {}
    # Affine transformation matrix (always available)
    # This describes coefficients mapping pixel coordinates to CRS
    # For serialization store as tuple of 6 floats, the last row being
    # always (0, 0, 1) per definition (see
    # https://github.com/sgillies/affine)
    attrs["transform"] = tuple(riods.transform)[:6]
    if hasattr(riods, "crs") and riods.crs:
        # CRS is a dict-like object specific to rasterio
        # If CRS is not None, we convert it back to a PROJ4 string using
        # rasterio itself
        try:
            attrs["crs"] = riods.crs.to_proj4()
        except AttributeError:
            attrs["crs"] = riods.crs.to_string()
    if hasattr(riods, "res"):
        # (width, height) tuple of pixels in units of CRS
        attrs["res"] = riods.res
    if hasattr(riods, "is_tiled"):
        # Is the TIF tiled? (bool)
        # We cast it to an int for netCDF compatibility
        attrs["is_tiled"] = np.uint8(riods.is_tiled)
    if hasattr(riods, "nodatavals"):
        # The nodata values for the raster bands
        attrs["nodatavals"] = tuple(np.nan if nodataval is None else nodataval
                                    for nodataval in riods.nodatavals)
    if hasattr(riods, "scales"):
        # The scale values for the raster bands
        attrs["scales"] = riods.scales
    if hasattr(riods, "offsets"):
        # The offset values for the raster bands
        attrs["offsets"] = riods.offsets
    if hasattr(riods, "descriptions") and any(riods.descriptions):
        # Descriptions for each dataset band
        attrs["descriptions"] = riods.descriptions
    if hasattr(riods, "units") and any(riods.units):
        # A list of units string for each dataset band
        attrs["units"] = riods.units

    # Parse extra metadata from tags, if supported
    parsers = {"ENVI": _parse_envi, "GTiff": lambda m: m}

    driver = riods.driver
    if driver in parsers:
        if driver == "GTiff":
            meta = parsers[driver](riods.tags())
        else:
            meta = parsers[driver](riods.tags(ns=driver))

        for k, v in meta.items():
            # Add values as coordinates if they match the band count,
            # as attributes otherwise
            if isinstance(v, (list, np.ndarray)) and len(v) == riods.count:
                coords[k] = ("band", np.asarray(v))
            else:
                attrs[k] = v

    data = indexing.LazilyIndexedArray(
        RasterioArrayWrapper(manager, lock, vrt_params))

    # this lets you write arrays loaded with rasterio
    data = indexing.CopyOnWriteArray(data)
    if cache and chunks is None:
        data = indexing.MemoryCachedArray(data)

    result = DataArray(data=data,
                       dims=("band", "y", "x"),
                       coords=coords,
                       attrs=attrs)

    if chunks is not None:
        from dask.base import tokenize

        # augment the token with the file modification time
        try:
            mtime = os.path.getmtime(filename)
        except OSError:
            # the filename is probably an s3 bucket rather than a regular file
            mtime = None
        token = tokenize(filename, mtime, chunks)
        name_prefix = f"open_rasterio-{token}"
        result = result.chunk(chunks, name_prefix=name_prefix, token=token)

    # Make the file closeable
    result.set_close(manager.close)

    return result
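
The chunking branch is the tokenize-relevant part: the dask name is derived from the file path, its modification time, and the chunk specification, so reopening the same unchanged file with the same chunks reuses the same task names, while touching the file invalidates them. A self-contained sketch of that naming scheme, using an ordinary temporary file as a stand-in for a raster:

import os
import tempfile
from dask.base import tokenize

with tempfile.NamedTemporaryFile(suffix=".tif", delete=False) as f:
    f.write(b"not a real raster, just a stand-in")
    filename = f.name

try:
    mtime = os.path.getmtime(filename)
except OSError:
    # e.g. an object-store URL rather than a local file
    mtime = None

chunks = {"x": 256, "y": 256}
token = tokenize(filename, mtime, chunks)
name_prefix = f"open_rasterio-{token}"

# Same inputs -> same token; changing any of them changes the name.
assert token == tokenize(filename, mtime, chunks)
os.remove(filename)
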
Example #43
def test_tokenize_numpy_datetime():
    tokenize(np.array(['2000-01-01T12:00:00'], dtype='M8[ns]'))
Example #44
def test_tokenize_object_with_recursion_error_returns_uuid():
    cycle = dict(a=None)
    cycle["a"] = cycle

    assert len(tokenize(cycle)) == 32
Example #45
def test_tokenize_kwargs():
    assert tokenize(5, x=1) == tokenize(5, x=1)
    assert tokenize(5) != tokenize(5, x=1)
    assert tokenize(5, x=1) != tokenize(5, x=2)
    assert tokenize(5, x=1) != tokenize(5, y=1)
Example #46
def test_tokenize_numpy_array_consistent_on_values():
    assert tokenize(
        np.random.RandomState(1234).random_sample(1000)) == tokenize(
            np.random.RandomState(1234).random_sample(1000))
Example #47
def test_tokenize_object_array_with_nans():
    a = np.array([u'foo', u'Jos\xe9', np.nan], dtype='O')
    assert tokenize(a) == tokenize(a)
Example #48
def test_tokenize_dict():
    assert tokenize({'x': 1, 1: 'x'}) == tokenize({'x': 1, 1: 'x'})
Example #49
def test_tokenize_base_types(x):
    assert tokenize(x) == tokenize(x), x
Example #50
def test_custom_collection():
    # Arbitrary hashables
    h1 = object()
    h2 = object()

    dsk = {("x", h1): 1, ("x", h2): 2}
    dsk2 = {
        ("y", h1): (add, ("x", h1), ("x", h2)),
        ("y", h2): (add, ("y", h1), 1)
    }
    dsk2.update(dsk)
    dsk3 = {"z": (add, ("y", h1), ("y", h2))}
    dsk3.update(dsk2)

    w = Tuple({}, [])  # A collection can have no keys at all
    x = Tuple(dsk, [("x", h1), ("x", h2)])
    y = Tuple(dsk2, [("y", h1), ("y", h2)])
    z = Tuple(dsk3, ["z"])
    # Collection with multiple names
    t = w + x + y + z

    # __slots__ defined on base mixin class propagates
    with pytest.raises(AttributeError):
        x.foo = 1

    # is_dask_collection
    assert is_dask_collection(w)
    assert is_dask_collection(x)
    assert is_dask_collection(y)
    assert is_dask_collection(z)
    assert is_dask_collection(t)

    # tokenize
    assert tokenize(w) == tokenize(w)
    assert tokenize(x) == tokenize(x)
    assert tokenize(y) == tokenize(y)
    assert tokenize(z) == tokenize(z)
    assert tokenize(t) == tokenize(t)
    # All tokens are unique
    assert len({tokenize(coll) for coll in (w, x, y, z, t)}) == 5

    # get_collection_names
    assert get_collection_names(w) == set()
    assert get_collection_names(x) == {"x"}
    assert get_collection_names(y) == {"y"}
    assert get_collection_names(z) == {"z"}
    assert get_collection_names(t) == {"x", "y", "z"}

    # compute
    assert w.compute() == ()
    assert x.compute() == (1, 2)
    assert y.compute() == (3, 4)
    assert z.compute() == (7, )
    assert dask.compute(w, [{
        "x": x
    }, y, z]) == ((), [{
        "x": (1, 2)
    }, (3, 4), (7, )])
    assert t.compute() == (1, 2, 3, 4, 7)

    # persist
    t2 = t.persist()
    assert isinstance(t2, Tuple)
    assert t2._keys == t._keys
    assert sorted(t2._dask.values()) == [1, 2, 3, 4, 7]
    assert t2.compute() == (1, 2, 3, 4, 7)

    w2, x2, y2, z2 = dask.persist(w, x, y, z)
    assert y2._keys == y._keys
    assert y2._dask == {("y", h1): 3, ("y", h2): 4}
    assert y2.compute() == (3, 4)

    t3 = x2 + y2 + z2
    assert t3.compute() == (1, 2, 3, 4, 7)

    # __dask_postpersist__ with name change
    rebuild, args = w.__dask_postpersist__()
    w3 = rebuild({}, *args, rename={"w": "w3"})
    assert w3.compute() == ()

    rebuild, args = x.__dask_postpersist__()
    x3 = rebuild({("x3", h1): 10, ("x3", h2): 20}, *args, rename={"x": "x3"})
    assert x3.compute() == (10, 20)

    rebuild, args = z.__dask_postpersist__()
    z3 = rebuild({"z3": 70}, *args, rename={"z": "z3"})
    assert z3.compute() == (70, )
Example #51
def reduction(
    args,
    chunk=None,
    aggregate=None,
    combine=None,
    meta=None,
    token=None,
    chunk_kwargs=None,
    aggregate_kwargs=None,
    combine_kwargs=None,
    split_every=None,
    **kwargs,
):
    """Generic tree reduction operation.

    Parameters
    ----------
    args :
        Positional arguments for the `chunk` function. All `dask.dataframe`
        objects should be partitioned and indexed equivalently.
    chunk : function [block-per-arg] -> block
        Function to operate on each block of data
    aggregate : function list-of-blocks -> block
        Function to operate on the list of results of chunk
    combine : function list-of-blocks -> block, optional
        Function to operate on intermediate lists of results of chunk
        in a tree-reduction. If not provided, defaults to aggregate.
    $META
    token : str, optional
        The name to use for the output keys.
    chunk_kwargs : dict, optional
        Keywords for the chunk function only.
    aggregate_kwargs : dict, optional
        Keywords for the aggregate function only.
    combine_kwargs : dict, optional
        Keywords for the combine function only.
    split_every : int, optional
        Group partitions into groups of this size while performing a
        tree-reduction. If set to False, no tree-reduction will be used,
        and all intermediates will be concatenated and passed to ``aggregate``.
        Default is 8.
    kwargs :
        All remaining keywords will be passed to ``chunk``, ``aggregate``, and
        ``combine``.
    """
    if chunk_kwargs is None:
        chunk_kwargs = dict()
    if aggregate_kwargs is None:
        aggregate_kwargs = dict()
    chunk_kwargs.update(kwargs)
    aggregate_kwargs.update(kwargs)

    if combine is None:
        if combine_kwargs:
            raise ValueError("`combine_kwargs` provided with no `combine`")
        combine = aggregate
        combine_kwargs = aggregate_kwargs
    else:
        if combine_kwargs is None:
            combine_kwargs = dict()
        combine_kwargs.update(kwargs)

    if not isinstance(args, (tuple, list)):
        args = [args]

    npartitions = set(
        arg.npartitions for arg in args if isinstance(arg, _Frame)
    )
    if len(npartitions) > 1:
        raise ValueError("All arguments must have same number of partitions")
    npartitions = npartitions.pop()

    if split_every is None:
        split_every = 8
    elif split_every is False:
        split_every = npartitions
    elif split_every < 2 or not isinstance(split_every, int):
        raise ValueError("split_every must be an integer >= 2")

    token_key = tokenize(
        token or (chunk, aggregate),
        meta,
        args,
        chunk_kwargs,
        aggregate_kwargs,
        combine_kwargs,
        split_every,
    )

    # Chunk
    a = "{0}-chunk-{1}".format(token or funcname(chunk), token_key)
    if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs:
        dsk = {
            (a, 0, i): (chunk, key)
            for i, key in enumerate(args[0].__dask_keys__())
        }
    else:
        dsk = {
            (a, 0, i): (
                apply,
                chunk,
                [(x._name, i) if isinstance(x, _Frame) else x for x in args],
                chunk_kwargs,
            )
            for i in range(args[0].npartitions)
        }

    # Combine
    b = "{0}-combine-{1}".format(token or funcname(combine), token_key)
    k = npartitions
    depth = 0
    while k > split_every:
        for part_i, inds in enumerate(partition_all(split_every, range(k))):
            conc = (list, [(a, depth, i) for i in inds])
            dsk[(b, depth + 1, part_i)] = (
                (apply, combine, [conc], combine_kwargs)
                if combine_kwargs
                else (combine, conc)
            )
        k = part_i + 1
        a = b
        depth += 1

    # Aggregate
    b = "{0}-agg-{1}".format(token or funcname(aggregate), token_key)
    conc = (list, [(a, depth, i) for i in range(k)])
    if aggregate_kwargs:
        dsk[(b, 0)] = (apply, aggregate, [conc], aggregate_kwargs)
    else:
        dsk[(b, 0)] = (aggregate, conc)

    if meta is None:
        meta_chunk = _emulate(apply, chunk, args, chunk_kwargs)
        meta = _emulate(apply, aggregate, [[meta_chunk]], aggregate_kwargs)
    meta = dd.core.make_meta(meta)

    graph = HighLevelGraph.from_collections(b, dsk, dependencies=args)
    return dd.core.new_dd_object(graph, b, meta, (None, None))
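
The combine loop above implements a tree reduction: partition results are grouped into batches of at most ``split_every``, each batch is combined, and the process repeats until a single level remains for the final aggregate. A plain-Python sketch of that control flow, assuming sum-like combine/aggregate functions (``tree_reduce`` is a hypothetical helper, not the dask.dataframe API):

from toolz import partition_all

def tree_reduce(parts, combine, aggregate, split_every=8):
    """Repeatedly combine batches of at most split_every results."""
    while len(parts) > split_every:
        parts = [combine(list(batch))
                 for batch in partition_all(split_every, parts)]
    return aggregate(list(parts))

chunked = [sum(range(i, i + 10)) for i in range(0, 100, 10)]  # 10 "partitions"
assert tree_reduce(chunked, combine=sum, aggregate=sum, split_every=3) == sum(range(100))
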
Example #52
def test_tokenize_discontiguous_numpy_array():
    tokenize(np.random.random(8)[::2])
Example #53
def _parallel_var(ddf, meta, skipna, split_every, out):
    def _local_var(x, skipna):
        if skipna:
            n = x.count(skipna=skipna)
            avg = x.mean(skipna=skipna)
        else:
            # Not skipping nulls, so might as well
            # avoid the full `count` operation
            n = len(x)
            avg = x.sum(skipna=skipna) / n
        m2 = ((x - avg) ** 2).sum(skipna=skipna)
        return n, avg, m2

    def _aggregate_var(parts):
        n, avg, m2 = parts[0]
        for i in range(1, len(parts)):
            n_a, avg_a, m2_a = n, avg, m2
            n_b, avg_b, m2_b = parts[i]
            n = n_a + n_b
            avg = (n_a * avg_a + n_b * avg_b) / n
            delta = avg_b - avg_a
            m2 = m2_a + m2_b + delta ** 2 * n_a * n_b / n
        return n, avg, m2

    def _finalize_var(vals):
        n, _, m2 = vals
        return m2 / (n - 1)

    # Build graph
    nparts = ddf.npartitions
    if not split_every:
        split_every = nparts
    name = "var-" + tokenize(skipna, split_every, out)
    local_name = "local-" + name
    num = ddf._get_numeric_data()
    dsk = {
        (local_name, n, 0): (_local_var, (num._name, n), skipna)
        for n in range(nparts)
    }

    # Use reduction tree
    widths = [nparts]
    while nparts > 1:
        nparts = math.ceil(nparts / split_every)
        widths.append(nparts)
    height = len(widths)
    for depth in range(1, height):
        for group in range(widths[depth]):
            p_max = widths[depth - 1]
            lstart = split_every * group
            lstop = min(lstart + split_every, p_max)
            node_list = [
                (local_name, p, depth - 1) for p in range(lstart, lstop)
            ]
            dsk[(local_name, group, depth)] = (_aggregate_var, node_list)
    if height == 1:
        group = depth = 0
    dsk[(name, 0)] = (_finalize_var, (local_name, group, depth))

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[num, ddf])
    result = dd.core.new_dd_object(graph, name, meta, (None, None))
    if isinstance(ddf, DataFrame):
        result.divisions = (min(ddf.columns), max(ddf.columns))
    return handle_out(out, result)
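
The ``_aggregate_var`` step uses the standard pairwise update for merging per-partition statistics: given counts, means and sums of squared deviations (n_a, avg_a, m2_a) and (n_b, avg_b, m2_b), the merged values are n = n_a + n_b, avg = (n_a*avg_a + n_b*avg_b)/n and m2 = m2_a + m2_b + delta**2 * n_a * n_b / n with delta = avg_b - avg_a. A quick NumPy check that the merged m2/(n - 1) matches the sample variance of the concatenated data:

import numpy as np

rng = np.random.RandomState(0)
a, b = rng.normal(size=100), rng.normal(size=57)

def local(x):
    n, avg = len(x), x.mean()
    return n, avg, ((x - avg) ** 2).sum()

(n_a, avg_a, m2_a), (n_b, avg_b, m2_b) = local(a), local(b)
n = n_a + n_b
avg = (n_a * avg_a + n_b * avg_b) / n
delta = avg_b - avg_a
m2 = m2_a + m2_b + delta ** 2 * n_a * n_b / n

# Matches the sample variance (ddof=1) of the combined data.
assert np.isclose(m2 / (n - 1), np.concatenate([a, b]).var(ddof=1))
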
Example #54
def test_tokenize_numpy_datetime():
    tokenize(np.array(["2000-01-01T12:00:00"], dtype="M8[ns]"))
Example #55
def affine_transform(image,
                     matrix,
                     offset=0.0,
                     output_shape=None,
                     order=1,
                     output_chunks=None,
                     **kwargs):
    """Apply an affine transform using Dask. For every
    output chunk, only the slice containing the relevant part
    of the image is processed. Chunkwise processing is performed
    either using `ndimage.affine_transform` or
    `cupyx.scipy.ndimage.affine_transform`, depending on the input type.

    Notes
    -----
        Differences to `ndimage.affine_transform`:
        - currently, prefiltering is not supported
          (affecting the output in case of interpolation `order > 1`)
        - default order is 1
        - modes 'reflect', 'mirror' and 'wrap' are not supported

        Arguments equal to `ndimage.affine_transform`,
        except for `output_chunks`.

    Parameters
    ----------
    image : array_like (Numpy Array, Cupy Array, Dask Array...)
        The image array.
    matrix : array (ndim,), (ndim, ndim), (ndim, ndim+1) or (ndim+1, ndim+1)
        Transformation matrix.
    offset : float or sequence, optional
        The offset into the array where the transform is applied. If a float,
        `offset` is the same for each axis. If a sequence, `offset` should
        contain one value for each axis.
    output_shape : tuple of ints, optional
        The shape of the array to be returned.
    order : int, optional
        The order of the spline interpolation. Note that for order>1
        scipy's affine_transform applies prefiltering, which is not
        yet supported and skipped in this implementation.
    output_chunks : tuple of ints, optional
        The shape of the chunks of the output Dask Array.

    Returns
    -------
    affine_transform : Dask Array
        A dask array representing the transformed output

    """

    if not type(image) == da.core.Array:
        image = da.from_array(image)

    if output_shape is None:
        output_shape = image.shape

    if output_chunks is None:
        output_chunks = image.shape

    # Perform test run to ensure parameter validity.
    ndimage_affine_transform(np.zeros([0] * image.ndim), matrix, offset)

    # Make sure parameters contained in matrix and offset
    # are not overlapping, i.e. that the offset is valid as
    # it needs to be modified for each chunk.
    # Further parameter checks are performed directly by
    # `ndimage.affine_transform`.

    matrix = np.asarray(matrix)
    offset = np.asarray(offset).squeeze()

    # these lines were copied and adapted from `ndimage.affine_transform`
    if (matrix.ndim == 2 and matrix.shape[1] == image.ndim + 1
            and (matrix.shape[0] in [image.ndim, image.ndim + 1])):

        # assume input is homogeneous coordinate transformation matrix
        offset = matrix[:image.ndim, image.ndim]
        matrix = matrix[:image.ndim, :image.ndim]

    # process kwargs
    # prefilter is not yet supported
    if 'prefilter' in kwargs:
        if kwargs['prefilter'] and order > 1:
            warnings.warn(
                'Currently, `dask_image.ndinterp.affine_transform` '
                'doesn\'t support `prefilter=True`. Proceeding with'
                ' `prefilter=False`, which if order > 1 can lead '
                'to the output containing more blur than with '
                'prefiltering.', UserWarning)
        del kwargs['prefilter']

    if 'mode' in kwargs:
        if kwargs['mode'] in ['wrap', 'reflect', 'mirror']:
            raise NotImplementedError(
                "Mode %s is not currently supported." % kwargs['mode'])

    n = image.ndim
    image_shape = image.shape

    # calculate output array properties
    normalized_chunks = da.core.normalize_chunks(output_chunks,
                                                 tuple(output_shape))
    block_indices = product(*(range(len(bds)) for bds in normalized_chunks))
    block_offsets = [np.cumsum((0, ) + bds[:-1]) for bds in normalized_chunks]

    # use dispatching mechanism to determine backend
    affine_transform_method = dispatch_affine_transform(image)
    asarray_method = dispatch_asarray(image)

    # construct dask graph for output array
    # using unique and deterministic identifier
    output_name = 'affine_transform-' + tokenize(
        image, matrix, offset, output_shape, output_chunks, kwargs)
    output_layer = {}
    rel_images = []
    for ib, block_ind in enumerate(block_indices):

        out_chunk_shape = [
            normalized_chunks[dim][block_ind[dim]] for dim in range(n)
        ]
        out_chunk_offset = [
            block_offsets[dim][block_ind[dim]] for dim in range(n)
        ]

        out_chunk_edges = np.array([i for i in np.ndindex(tuple([2] * n))])\
            * np.array(out_chunk_shape) + np.array(out_chunk_offset)

        # map output chunk edges onto input image coordinates
        # to define the input region relevant for the current chunk
        if matrix.ndim == 1 and len(matrix) == image.ndim:
            rel_image_edges = matrix * out_chunk_edges + offset
        else:
            rel_image_edges = np.dot(matrix, out_chunk_edges.T).T + offset

        rel_image_i = np.min(rel_image_edges, 0)
        rel_image_f = np.max(rel_image_edges, 0)

        # Calculate edge coordinates required for the footprint of the
        # spline kernel according to
        # https://github.com/scipy/scipy/blob/9c0d08d7d11fc33311a96d2ac3ad73c8f6e3df00/scipy/ndimage/src/ni_interpolation.c#L412-L419 # noqa: E501
        # Also see this discussion:
        # https://github.com/dask/dask-image/issues/24#issuecomment-706165593 # noqa: E501
        for dim in range(n):

            if order % 2 == 0:
                rel_image_i[dim] += 0.5
                rel_image_f[dim] += 0.5

            rel_image_i[dim] = np.floor(rel_image_i[dim]) - order // 2
            rel_image_f[dim] = np.floor(rel_image_f[dim]) - order // 2 + order

            if order == 0:  # required for consistency with scipy.ndimage
                rel_image_i[dim] -= 1

        # clip image coordinates to image extent
        for dim, s in zip(range(n), image_shape):
            rel_image_i[dim] = np.clip(rel_image_i[dim], 0, s - 1)
            rel_image_f[dim] = np.clip(rel_image_f[dim], 0, s - 1)

        rel_image_slice = tuple([
            slice(int(rel_image_i[dim]),
                  int(rel_image_f[dim]) + 2) for dim in range(n)
        ])

        rel_image = image[rel_image_slice]
        """Block comment for future developers explaining how `offset` is
        transformed into `offset_prime` for each output chunk.
        Modify offset to point into cropped image.
        y = Mx + o
        Coordinate substitution:
        y' = y - y0(min_coord_px)
        x' = x - x0(chunk_offset)
        Then:
        y' = Mx' + o + Mx0 - y0
        M' = M
        o' = o + Mx0 - y0
        """

        offset_prime = offset + np.dot(matrix, out_chunk_offset) - rel_image_i

        output_layer[(output_name, ) + block_ind] = (
            affine_transform_method,
            (da.core.concatenate3, rel_image.__dask_keys__()),
            asarray_method(matrix),
            offset_prime,
            tuple(out_chunk_shape),  # output_shape
            None,  # out
            order,
            'constant' if 'mode' not in kwargs else kwargs['mode'],
            0. if 'cval' not in kwargs else kwargs['cval'],
            False  # prefilter
        )

        rel_images.append(rel_image)

    graph = HighLevelGraph.from_collections(output_name,
                                            output_layer,
                                            dependencies=[image] + rel_images)

    meta = dispatch_asarray(image)([0]).astype(image.dtype)

    transformed = da.Array(
        graph,
        output_name,
        shape=tuple(output_shape),
        # chunks=output_chunks,
        chunks=normalized_chunks,
        meta=meta)

    return transformed
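
A usage sketch, assuming the function above is exposed as `dask_image.ndinterp.affine_transform` (the path its warning message refers to); the image size, chunking and transform are arbitrary illustration values:

import numpy as np
import dask.array as da
from dask_image.ndinterp import affine_transform  # assumed import path

image = da.from_array(np.random.random((64, 64)), chunks=(32, 32))
matrix = np.eye(2)                      # identity transform
offset = [0.0, 0.0]

out = affine_transform(image, matrix, offset,
                       output_shape=(64, 64),
                       output_chunks=(32, 32),
                       order=1)
result = out.compute()                  # chunks are transformed when computed

# Per-chunk offset remapping derived above (o' = o + M·x0 - y0), with x0 the
# output-chunk offset and y0 the origin of the cropped input region:
x0, y0 = np.array([32, 0]), np.array([30, 0])
offset_prime = np.asarray(offset) + matrix.dot(x0) - y0   # -> [2., 0.]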
Example #56
0
def open_rasterio(filename, chunks=None, cache=None, lock=None):
    """Open a file with rasterio (experimental).

    This should work with any file that rasterio can open (most often:
    geoTIFF). The x and y coordinates are generated automatically from the
    file's geoinformation, shifted to the center of each pixel (see
    `"PixelIsArea" Raster Space
    <http://web.archive.org/web/20160326194152/http://remotesensing.org/geotiff/spec/geotiff2.5.html#2.5.2>`_
    for more information).

    Parameters
    ----------
    filename : str
        Path to the file to open.
    chunks : int, tuple or dict, optional
        Chunk sizes along each dimension, e.g., ``5``, ``(5, 5)`` or
        ``{'x': 5, 'y': 5}``. If chunks is provided, it is used to load the
        new DataArray into a dask array.
    cache : bool, optional
        If True, cache data loaded from the underlying datastore in memory as
        NumPy arrays when accessed, to avoid reading from the underlying
        datastore multiple times. Defaults to True unless you specify the
        `chunks` argument to use dask, in which case it defaults to False.
    lock : False, True or threading.Lock, optional
        If chunks is provided, this argument is passed on to
        :py:func:`dask.array.from_array`. By default, a global lock is
        used to avoid issues with concurrent access to the same file when
        using dask's multithreaded backend.

    Returns
    -------
    data : DataArray
        The newly created DataArray.
    """

    import rasterio
    riods = rasterio.open(filename, mode='r')

    if cache is None:
        cache = chunks is None

    coords = OrderedDict()

    # Get bands
    if riods.count < 1:
        raise ValueError('Unknown dims')
    coords['band'] = np.asarray(riods.indexes)

    # Get geo coords
    nx, ny = riods.width, riods.height
    dx, dy = riods.res[0], -riods.res[1]
    x0 = riods.bounds.right if dx < 0 else riods.bounds.left
    y0 = riods.bounds.top if dy < 0 else riods.bounds.bottom
    coords['y'] = np.linspace(start=y0 + dy / 2,
                              num=ny,
                              stop=(y0 + (ny - 1) * dy) + dy / 2)
    coords['x'] = np.linspace(start=x0 + dx / 2,
                              num=nx,
                              stop=(x0 + (nx - 1) * dx) + dx / 2)

    # Attributes
    attrs = {}
    if hasattr(riods, 'crs') and riods.crs:
        # CRS is a dict-like object specific to rasterio
        # If CRS is not None, we convert it back to a PROJ4 string using
        # rasterio itself
        attrs['crs'] = riods.crs.to_string()
    if hasattr(riods, 'res'):
        # (width, height) tuple of pixels in units of CRS
        attrs['res'] = riods.res
    if hasattr(riods, 'is_tiled'):
        # Is the TIF tiled? (bool)
        # We cast it to an int for netCDF compatibility
        attrs['is_tiled'] = np.uint8(riods.is_tiled)
    if hasattr(riods, 'transform'):
        # Affine transformation matrix (tuple of floats)
        # Describes coefficients mapping pixel coordinates to CRS
        attrs['transform'] = tuple(riods.transform)

    data = indexing.LazilyIndexedArray(RasterioArrayWrapper(riods))

    # this lets you write arrays loaded with rasterio
    data = indexing.CopyOnWriteArray(data)
    if cache and (chunks is None):
        data = indexing.MemoryCachedArray(data)

    result = DataArray(data=data,
                       dims=('band', 'y', 'x'),
                       coords=coords,
                       attrs=attrs)

    if chunks is not None:
        from dask.base import tokenize
        # augment the token with the file modification time
        mtime = os.path.getmtime(filename)
        token = tokenize(filename, mtime, chunks)
        name_prefix = 'open_rasterio-%s' % token
        if lock is None:
            lock = RASTERIO_LOCK
        result = result.chunk(chunks,
                              name_prefix=name_prefix,
                              token=token,
                              lock=lock)

    # Make the file closeable
    result._file_obj = riods

    return result
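
The x and y coordinates above are built directly from the raster's bounds and resolution, shifted by half a pixel so they refer to cell centers. A minimal standalone sketch with hypothetical numbers (no rasterio required):

import numpy as np

# Hypothetical north-up raster: 4 columns, 3 rows, 10-unit pixels,
# left bound at x=100, top bound at y=230 (dy is negative).
nx, ny = 4, 3
dx, dy = 10.0, -10.0
x0, y0 = 100.0, 230.0

x = np.linspace(start=x0 + dx / 2, stop=(x0 + (nx - 1) * dx) + dx / 2, num=nx)
y = np.linspace(start=y0 + dy / 2, stop=(y0 + (ny - 1) * dy) + dy / 2, num=ny)

# x -> [105. 115. 125. 135], y -> [225. 215. 205]: pixel centers, not corners.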
Example #57
0
def read_orc(path, columns=None, storage_options=None, **kwargs):
    """Read cudf dataframe from ORC file(s).

    Note that this function is mostly borrowed from upstream Dask.

    Parameters
    ----------
    path: str or list(str)
        Location of file(s), which can be a full URL with protocol specifier,
        and may include glob character if a single string.
    columns: None or list(str)
        Columns to load. If None, loads all.
    storage_options: None or dict
        Further parameters to pass to the bytes backend.

    Returns
    -------
    cudf.DataFrame
    """

    storage_options = storage_options or {}
    fs, fs_token, paths = get_fs_token_paths(path,
                                             mode="rb",
                                             storage_options=storage_options)
    schema = None
    nstripes_per_file = []
    for path in paths:
        with fs.open(path, "rb") as f:
            o = orc.ORCFile(f)
            if schema is None:
                schema = o.schema
            elif schema != o.schema:
                raise ValueError(
                    "Incompatible schemas while parsing ORC files")
            nstripes_per_file.append(o.nstripes)
    schema = _get_pyarrow_dtypes(schema, categories=None)
    if columns is not None:
        ex = set(columns) - set(schema)
        if ex:
            raise ValueError("Requested columns (%s) not in schema (%s)" %
                             (ex, set(schema)))
    else:
        columns = list(schema)

    with fs.open(paths[0], "rb") as f:
        meta = cudf.read_orc(f, stripes=[0], columns=columns, **kwargs)

    name = "read-orc-" + tokenize(fs_token, path, columns, **kwargs)
    dsk = {}
    N = 0
    for path, n in zip(paths, nstripes_per_file):
        for stripe in range(n):
            dsk[(name, N)] = (
                _read_orc_stripe,
                fs,
                path,
                stripe,
                columns,
                kwargs,
            )
            N += 1

    divisions = [None] * (len(dsk) + 1)
    return dd.core.new_dd_object(dsk, name, meta, divisions)
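
A usage sketch, assuming this function is exposed as `dask_cudf.read_orc`; the path and column names below are hypothetical:

import dask_cudf

# One output partition is created per ORC stripe across all matched files.
ddf = dask_cudf.read_orc(
    "data/example.orc",           # hypothetical path (glob patterns also work)
    columns=["id", "value"],      # hypothetical column names
)
print(ddf.npartitions)            # total number of stripes
gdf = ddf.compute()               # materializes a cudf.DataFrame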
Example #58
0
def groupby_agg(
    ddf,
    gb_cols,
    aggs_in,
    split_every=None,
    split_out=None,
    dropna=True,
    sep="___",
    sort=False,
    as_index=True,
):
    """ Optimized groupby aggregation for Dask-CuDF.

        This aggregation algorithm only supports the following options:

        - "count"
        - "mean"
        - "std"
        - "var"
        - "sum"
        - "min"
        - "max"
        - "collect"
        - "first"
        - "last"

        This "optimized" approach is more performant than the algorithm
        in `dask.dataframe`, because it allows the cudf backend to
        perform multiple aggregations at once.
    """

    # Deal with default split_out and split_every params
    if split_every is False:
        split_every = ddf.npartitions
    split_every = split_every or 8
    split_out = split_out or 1

    # Standardize `gb_cols` and `columns` lists
    aggs = _redirect_aggs(aggs_in.copy())
    if isinstance(gb_cols, str):
        gb_cols = [gb_cols]
    columns = [c for c in ddf.columns if c not in gb_cols]
    str_cols_out = False
    if isinstance(aggs, dict):
        # Use `str_cols_out` to specify if the output columns
        # will have str (rather than MultiIndex/tuple) names.
        # This happens when all values in the `aggs` dict are
        # strings (no lists)
        str_cols_out = True
        for col in aggs:
            if isinstance(aggs[col], str) or callable(aggs[col]):
                aggs[col] = [aggs[col]]
            else:
                str_cols_out = False
            if col in gb_cols:
                columns.append(col)

    # Assert that aggregations are supported
    _supported = {
        "count",
        "mean",
        "std",
        "var",
        "sum",
        "min",
        "max",
        "collect",
        "first",
        "last",
    }
    if not _is_supported(aggs, _supported):
        raise ValueError(
            f"Supported aggs include {_supported} for groupby_agg API. "
            f"Aggregations must be specified with dict or list syntax."
        )

    # Always convert aggs to dict for consistency
    if isinstance(aggs, list):
        aggs = {col: aggs for col in columns}

    # Begin graph construction
    dsk = {}
    token = tokenize(ddf, gb_cols, aggs)
    partition_agg_name = "groupby_partition_agg-" + token
    tree_reduce_name = "groupby_tree_reduce-" + token
    gb_agg_name = "groupby_agg-" + token
    for p in range(ddf.npartitions):
        # Perform groupby aggregation on each partition.
        # Split each result into `split_out` chunks (by hashing `gb_cols`)
        dsk[(partition_agg_name, p)] = (
            _groupby_partition_agg,
            (ddf._name, p),
            gb_cols,
            aggs,
            columns,
            split_out,
            dropna,
            sort,
            sep,
        )
        # Pick out each chunk using `getitem`
        for s in range(split_out):
            dsk[(tree_reduce_name, p, s, 0)] = (
                getitem,
                (partition_agg_name, p),
                s,
            )

    # Build reduction tree
    parts = ddf.npartitions
    widths = [parts]
    while parts > 1:
        parts = math.ceil(parts / split_every)
        widths.append(parts)
    height = len(widths)
    for s in range(split_out):
        for depth in range(1, height):
            for group in range(widths[depth]):

                p_max = widths[depth - 1]
                lstart = split_every * group
                lstop = min(lstart + split_every, p_max)
                node_list = [
                    (tree_reduce_name, p, s, depth - 1)
                    for p in range(lstart, lstop)
                ]

                dsk[(tree_reduce_name, group, s, depth)] = (
                    _tree_node_agg,
                    node_list,
                    gb_cols,
                    split_out,
                    dropna,
                    sort,
                    sep,
                )

    # Final output partitions.
    _aggs = aggs.copy()
    if str_cols_out:
        # Metadata should use `str` for dict values if that is
        # what the user originally specified (column names will
        # be str, rather than tuples).
        for col in aggs:
            _aggs[col] = _aggs[col][0]
    _meta = ddf._meta.groupby(gb_cols, as_index=as_index).agg(_aggs)
    for s in range(split_out):
        dsk[(gb_agg_name, s)] = (
            _finalize_gb_agg,
            (tree_reduce_name, 0, s, height - 1),
            gb_cols,
            aggs,
            columns,
            _meta.columns,
            as_index,
            sort,
            sep,
            str_cols_out,
        )

    divisions = [None] * (split_out + 1)
    graph = HighLevelGraph.from_collections(
        gb_agg_name, dsk, dependencies=[ddf]
    )
    return new_dd_object(graph, gb_agg_name, _meta, divisions)
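
A usage sketch, assuming `groupby_agg` can be imported from `dask_cudf.groupby` (the module this example appears to belong to); the frame and aggregation spec are illustrative only:

import cudf
import dask_cudf
from dask_cudf.groupby import groupby_agg  # assumed import path

gdf = cudf.DataFrame({"k": [1, 1, 2, 2], "v": [1.0, 2.0, 3.0, 4.0]})
ddf = dask_cudf.from_cudf(gdf, npartitions=2)

# One partition-level aggregation per input partition, a tree reduction of
# width `split_every`, then `split_out` final output partitions.
out = groupby_agg(
    ddf,
    gb_cols="k",
    aggs_in={"v": ["sum", "mean"]},
    split_every=8,
    split_out=1,
)
print(out.compute())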
Example #59
0
def test_tokenize_numpy_array_supports_uneven_sizes():
    tokenize(np.random.random(7).astype(dtype="i2"))
Example #60
0
def test_tokenize_set():
    assert tokenize({1, 2, 'x', (1, 'x')}) == tokenize({1, 2, 'x', (1, 'x')})