def test_tokenize_numpy_memmap():
    with tmpfile('.npy') as fn:
        x = np.arange(5)
        np.save(fn, x)
        y = tokenize(np.load(fn, mmap_mode='r'))

    with tmpfile('.npy') as fn:
        x = np.arange(5)
        np.save(fn, x)
        z = tokenize(np.load(fn, mmap_mode='r'))
    assert y != z

    with tmpfile('.npy') as fn:
        x = np.random.normal(size=(10, 10))
        np.save(fn, x)
        mm = np.load(fn, mmap_mode='r')
        mm2 = np.load(fn, mmap_mode='r')
        a = tokenize(mm[0, :])
        b = tokenize(mm[1, :])
        c = tokenize(mm[0:3, :])
        d = tokenize(mm[:, 0])
        assert len(set([a, b, c, d])) == 4
        assert tokenize(mm) == tokenize(mm2)
        assert tokenize(mm[1, :]) == tokenize(mm2[1, :])
def test_tokenize_pandas_no_pickle():
    class NoPickle(object):
        # pickling not supported because it is a local class
        pass

    df = pd.DataFrame({'x': ['foo', None, NoPickle()]})
    tokenize(df)
def test_frompyfunc_wrapper():
    f = da_frompyfunc(add, 2, 1)
    np_f = np.frompyfunc(add, 2, 1)
    x = np.array([1, 2, 3])

    # Callable
    np.testing.assert_equal(f(x, 1), np_f(x, 1))

    # picklable
    f2 = pickle.loads(pickle.dumps(f))
    np.testing.assert_equal(f2(x, 1), np_f(x, 1))

    # Attributes
    assert f.ntypes == np_f.ntypes
    with pytest.raises(AttributeError):
        f.not_an_attribute

    # Tab completion
    assert 'ntypes' in dir(f)

    # Methods
    np.testing.assert_equal(f.outer(x, x), np_f.outer(x, x))

    # funcname
    assert f.__name__ == 'frompyfunc-add'

    # repr
    assert repr(f) == "da.frompyfunc<add, 2, 1>"

    # tokenize
    assert (tokenize(da_frompyfunc(add, 2, 1)) ==
            tokenize(da_frompyfunc(add, 2, 1)))
def test_tokenize_numpy_matrix():
    rng = np.random.RandomState(1234)
    a = np.asmatrix(rng.rand(100))
    b = a.copy()
    assert tokenize(a) == tokenize(b)

    b[:10] = 1
    assert tokenize(a) != tokenize(b)
def test_tokenize_numpy_ufunc_consistent():
    assert tokenize(np.sin) == '02106e2c67daf452fb480d264e0dac21'
    assert tokenize(np.cos) == 'c99e52e912e4379882a9a4b387957a0b'

    # Make a ufunc that isn't in the numpy namespace.  Similar to
    # any found in other packages.
    inc = np.frompyfunc(lambda x: x + 1, 1, 1)
    assert tokenize(inc) == tokenize(inc)
def test_tokenize_same_repr():
    class Foo(object):
        def __init__(self, x):
            self.x = x

        def __repr__(self):
            return 'a foo'

    assert tokenize(Foo(1)) != tokenize(Foo(2))
def test_tokenize_ordered_dict():
    with ignoring(ImportError):
        from collections import OrderedDict
        a = OrderedDict([('a', 1), ('b', 2)])
        b = OrderedDict([('a', 1), ('b', 2)])
        c = OrderedDict([('b', 2), ('a', 1)])

        assert tokenize(a) == tokenize(b)
        assert tokenize(a) != tokenize(c)
def test_tokenize_masked_array():
    m = np.ma.masked_array([1, 2, 3], mask=[True, True, False], fill_value=10)
    m2 = np.ma.masked_array([1, 2, 3], mask=[True, True, False], fill_value=0)
    m3 = np.ma.masked_array([1, 2, 3], mask=False, fill_value=10)
    assert tokenize(m) == tokenize(m)
    assert tokenize(m2) == tokenize(m2)
    assert tokenize(m3) == tokenize(m3)
    assert tokenize(m) != tokenize(m2)
    assert tokenize(m) != tokenize(m3)
def test_tokenize_numpy_memmap_no_filename():
    # GH 1562:
    with tmpfile('.npy') as fn1, tmpfile('.npy') as fn2:
        x = np.arange(5)
        np.save(fn1, x)
        np.save(fn2, x)

        a = np.load(fn1, mmap_mode='r')
        b = a + a
        assert tokenize(b) == tokenize(b)
def test_predefined_split():
    cv = PredefinedSplit(np.array(list(range(4)) * 5))
    cv2 = PredefinedSplit(np.array(list(range(5)) * 4))
    assert tokenize(cv) == tokenize(cv)
    assert tokenize(cv) != tokenize(cv2)

    sol = cv.get_n_splits(np_X, np_y, np_groups)
    assert compute_n_splits(cv, np_X, np_y, np_groups) == sol

    with assert_dask_compute(False):
        assert compute_n_splits(cv, da_X, da_y, da_groups) == sol
def test_old_style_cv():
    cv1 = _CVIterableWrapper([np.array([True, False, True, False] * 5),
                              np.array([False, True, False, True] * 5)])
    cv2 = _CVIterableWrapper([np.array([True, False, True, False] * 5),
                              np.array([False, True, True, True] * 5)])
    assert tokenize(cv1) == tokenize(cv1)
    assert tokenize(cv1) != tokenize(cv2)

    sol = cv1.get_n_splits(np_X, np_y, np_groups)
    assert compute_n_splits(cv1, np_X, np_y, np_groups) == sol

    with assert_dask_compute(False):
        assert compute_n_splits(cv1, da_X, da_y, da_groups) == sol
def test_tokenize_numpy_memmap():
    with tmpfile('.npy') as fn:
        x = np.arange(5)
        np.save(fn, x)
        y = tokenize(np.load(fn, mmap_mode='r'))

    with tmpfile('.npy') as fn:
        x = np.arange(5)
        np.save(fn, x)
        z = tokenize(np.load(fn, mmap_mode='r'))
    assert y != z
def test_tokenize_entityset():
    es = make_ecommerce_entityset()
    dupe = make_ecommerce_entityset()
    int_es = make_ecommerce_entityset(with_integer_time_index=True)

    # check that identical entitysets hash to the same token
    assert tokenize(es) == tokenize(dupe)

    # not the same if the product relationship is missing
    productless = make_ecommerce_entityset()
    productless.relationships.pop()
    assert tokenize(es) != tokenize(productless)

    # not the same if the entityset uses an integer time index
    assert tokenize(es) != tokenize(int_es)

    # adding a row to cohorts changes the token
    cohorts_df = dupe['cohorts'].df
    new_row = pd.DataFrame(data={'cohort': [2],
                                 'cohort_name': ['On Time Adopters'],
                                 'cohort_end': [pd.Timestamp('2011-04-08 12:00:00')]},
                           columns=['cohort', 'cohort_name', 'cohort_end'],
                           index=[2])
    more_cohorts = cohorts_df.append(new_row, ignore_index=True, sort=True)
    dupe['cohorts'].update_data(more_cohorts)
    assert tokenize(es) != tokenize(dupe)
def _stack(futures, axis=0, executor=None):
    executor = default_executor(executor)
    assert isinstance(futures, (list, tuple))
    assert all(isinstance(f, Future) for f in futures)  # flat list

    shapes = executor.map(lambda x: x.shape, futures[:10])
    dtype = executor.submit(get_dtype, futures[0])

    shapes, dtype = yield executor._gather([shapes, dtype])

    shape = shapes[0]
    assert all(shape == s for s in shapes)

    slc = ((slice(None),) * axis
           + (None,)
           + (slice(None),) * (len(shape) - axis))
    chunks = (tuple((shape[i],) for i in range(axis))
              + ((1,) * len(futures),)
              + tuple((shape[i],) for i in range(axis, len(shape))))

    name = 'stack-futures' + tokenize(*futures)
    keys = list(product([name], *[range(len(c)) for c in chunks]))
    dsk = {k: (getitem, f, slc) for k, f in zip(keys, futures)}

    raise gen.Return(da.Array(dsk, name, chunks, dtype))
def test_leave_group_out(cvs):
    tokens = []
    for cv in cvs:
        assert tokenize(cv) == tokenize(cv)
        tokens.append(tokenize(cv))
    assert len(set(tokens)) == len(tokens)

    cv = cvs[0]
    sol = cv.get_n_splits(np_X, np_y, np_groups)
    assert compute_n_splits(cv, np_X, np_y, np_groups) == sol

    with assert_dask_compute(True):
        assert compute_n_splits(cv, da_X, da_y, da_groups) == sol

    with assert_dask_compute(False):
        assert compute_n_splits(cv, da_X, da_y, np_groups) == sol
def broadcast_to_workers(workers, data, report=False, rpc=rpc):
    """ Broadcast data directly to all workers

    This sends all data to every worker.

    Currently this works inefficiently by sending all data out directly
    from the scheduler.  In the future we should have the workers communicate
    amongst themselves.

    Parameters
    ----------
    workers: sequence of (host, port) pairs
    data: sequence of data

    See Also
    --------
    scatter_to_workers
    """
    if isinstance(data, dict):
        names, data = list(zip(*data.items()))
    else:
        names = []
        for x in data:
            try:
                names.append(tokenize(x))
            except:
                names.append(str(uuid.uuid1()))
        data = dict(zip(names, data))

    out = yield All([rpc(ip=w_ip, port=w_port).update_data(data=data,
                                                           report=report)
                     for (w_ip, w_port) in workers])
    nbytes = merge([o[1]['nbytes'] for o in out])

    raise Return((names, nbytes))
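# A minimal, self-contained sketch (not part of the original code) of the
# naming pattern used in `broadcast_to_workers` above: each datum is keyed by
# its deterministic token, with a random uuid as a fallback for objects that
# cannot be tokenized.  Only `dask.base.tokenize` and the standard library are
# assumed; `name_data` is a hypothetical helper written for illustration.
import uuid

from dask.base import tokenize


def name_data(data):
    """Return {name: datum}, using tokenize(x) when possible."""
    names = []
    for x in data:
        try:
            names.append(tokenize(x))
        except Exception:
            names.append(str(uuid.uuid1()))
    return dict(zip(names, data))


# Identical values get identical names, so re-broadcasting the same data
# reuses the same keys instead of storing it again under new ones.
assert name_data([1, 2, 3]) == name_data([1, 2, 3])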
def maybe_decode_store(store, lock=False):
    ds = conventions.decode_cf(
        store, mask_and_scale=mask_and_scale, decode_times=decode_times,
        concat_characters=concat_characters, decode_coords=decode_coords,
        drop_variables=drop_variables)

    if chunks is not None:
        try:
            from dask.base import tokenize
        except ImportError:
            import dask  # raise the usual error if dask is entirely missing
            if dask.__version__ < '0.6':
                raise ImportError('xarray requires dask version 0.6 or newer')
            else:
                raise

        if (isinstance(filename_or_obj, basestring) and
                not is_remote_uri(filename_or_obj)):
            file_arg = os.path.getmtime(filename_or_obj)
        else:
            file_arg = filename_or_obj
        token = tokenize(file_arg, group, decode_cf, mask_and_scale,
                         decode_times, concat_characters, decode_coords,
                         engine, chunks, drop_variables)
        name_prefix = '%s:%s/' % (filename_or_obj, group or '')
        ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token,
                       lock=lock)
        ds2._file_obj = ds._file_obj
    else:
        ds2 = ds

    return ds2
def line(glyph, df, schema, canvas, summary):
    shape, bounds, st, axis = shape_bounds_st_and_axis(df, canvas, glyph)

    # Compile functions
    create, info, append, combine, finalize = compile_components(summary,
                                                                 schema)
    x_mapper = canvas.x_axis.mapper
    y_mapper = canvas.y_axis.mapper
    extend = glyph._build_extend(x_mapper, y_mapper, info, append)

    def chunk(df, df2=None):
        plot_start = True
        if df2 is not None:
            df = pd.concat([df.iloc[-1:], df2])
            plot_start = False
        aggs = create(shape)
        extend(aggs, df, st, bounds, plot_start=plot_start)
        return aggs

    name = tokenize(df._name, canvas, glyph, summary)
    old_name = df._name
    dsk = {(name, 0): (chunk, (old_name, 0))}
    for i in range(1, df.npartitions):
        dsk[(name, i)] = (chunk, (old_name, i - 1), (old_name, i))
    keys2 = [(name, i) for i in range(df.npartitions)]
    dsk[name] = (apply, finalize, [(combine, keys2)],
                 dict(coords=axis, dims=['y_axis', 'x_axis']))
    return dsk, name
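# Hypothetical illustration (not from datashader) of the overlap trick used in
# `chunk` above: each partition after the first is prepended with the last row
# of the previous partition, so line segments that cross a partition boundary
# are not dropped.  Plain pandas only; the frames below are made-up data.
import pandas as pd

part0 = pd.DataFrame({'x': [0, 1, 2], 'y': [0, 1, 0]})
part1 = pd.DataFrame({'x': [3, 4], 'y': [1, 0]})

# Rebuild partition 1 with one row of overlap, mirroring
# `pd.concat([df.iloc[-1:], df2])` in the code above.
stitched = pd.concat([part0.iloc[-1:], part1])
assert len(stitched) == len(part1) + 1
assert stitched.iloc[0]['x'] == 2  # the segment (2, 0) -> (3, 1) is preserved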
def dask_pipeline(df, schema, canvas, glyph, summary):
    create, info, append, combine, finalize = compile_components(summary,
                                                                 schema)
    extend = glyph._build_extend(info, append)

    x_range = canvas.x_range or compute_x_bounds(glyph, df)
    y_range = canvas.y_range or compute_y_bounds(glyph, df)
    x_min, x_max, y_min, y_max = compute(*(x_range + y_range))
    x_range, y_range = (x_min, x_max), (y_min, y_max)
    df = subselect(glyph, df, canvas)

    vt = canvas.view_transform(x_range, y_range)
    shape = (canvas.plot_height, canvas.plot_width)

    def chunk(df):
        aggs = create(shape)
        extend(aggs, df, vt)
        return aggs

    name = tokenize(df._name, canvas, glyph, summary)
    keys = df._keys()
    keys2 = [(name, i) for i in range(len(keys))]
    dsk = dict((k2, (chunk, k)) for (k2, k) in zip(keys2, keys))
    dsk[name] = (finalize, (combine, keys2))
    dsk.update(df.dask)
    dsk = df._optimize(dsk, name)

    get = _globals['get'] or df._default_get

    return get(dsk, name)
def score(self, X, y):
    self.predict(X)
    names = self.get_predict_keys(X)
    name = ('score', tokenize(names[-1], y))
    dsk = {name: (accuracy_score, names[-1], y)}
    return Value(name, [dsk, self.dask.copy()])
def maybe_decode_store(store, lock=False):
    ds = conventions.decode_cf(
        store, mask_and_scale=mask_and_scale, decode_times=decode_times,
        concat_characters=concat_characters, decode_coords=decode_coords,
        drop_variables=drop_variables)

    _protect_dataset_variables_inplace(ds, cache)

    if chunks is not None:
        from dask.base import tokenize

        # if passed an actual file path, augment the token with
        # the file modification time
        if (isinstance(filename_or_obj, basestring) and
                not is_remote_uri(filename_or_obj)):
            mtime = os.path.getmtime(filename_or_obj)
        else:
            mtime = None
        token = tokenize(filename_or_obj, mtime, group, decode_cf,
                         mask_and_scale, decode_times, concat_characters,
                         decode_coords, engine, chunks, drop_variables)
        name_prefix = 'open_dataset-%s' % token
        ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token,
                       lock=lock)
        ds2._file_obj = ds._file_obj
    else:
        ds2 = ds

    # protect so that dataset store isn't necessarily closed, e.g.,
    # streams like BytesIO can't be reopened
    # datastore backend is responsible for determining this capability
    if store._autoclose:
        store.close()

    return ds2
def _get_solar_flux_old(self, band):
    # TODO: this could be replaced with vectorized indexing in the future.
    from dask.base import tokenize
    blocksize = CHUNK_SIZE

    solar_flux = self.cal['solar_flux'].isel(bands=band).values
    d_index = self.cal['detector_index'].fillna(0).astype(int)

    shape = d_index.shape
    vchunks = range(0, shape[0], blocksize)
    hchunks = range(0, shape[1], blocksize)

    token = tokenize(band, d_index, solar_flux)
    name = 'solar_flux_' + token

    def get_items(array, slices):
        return solar_flux[d_index[slices].values]

    dsk = {(name, i, j): (get_items,
                          d_index,
                          (slice(vcs, min(vcs + blocksize, shape[0])),
                           slice(hcs, min(hcs + blocksize, shape[1]))))
           for i, vcs in enumerate(vchunks)
           for j, hcs in enumerate(hchunks)}

    res = da.Array(dsk, name, shape=shape,
                   chunks=(blocksize, blocksize),
                   dtype=solar_flux.dtype)
    return res
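# Rough sketch of the pattern `_get_solar_flux_old` relies on: build a plain
# dict graph keyed by (name, i, j), derive `name` from `tokenize(...)` so the
# graph is deterministic, then wrap it in `da.Array`.  This is illustration
# only; it assumes numpy and dask.array are available, and every name below
# (`source`, `_block`, ...) is made up for the example.
import numpy as np
import dask.array as da
from dask.base import tokenize

source = np.arange(16).reshape(4, 4)
blocksize = 2
name = 'example-blocks-' + tokenize(source, blocksize)


def _block(slices):
    # Each task loads one block of the source array.
    return source[slices]


dsk = {(name, i, j): (_block, (slice(2 * i, 2 * i + 2),
                               slice(2 * j, 2 * j + 2)))
       for i in range(2) for j in range(2)}

arr = da.Array(dsk, name, shape=source.shape,
               chunks=(blocksize, blocksize), dtype=source.dtype)
np.testing.assert_array_equal(arr.compute(), source)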
def submit(self, func, *args, **kwargs):
    """ Submit a function application to the scheduler

    Parameters
    ----------
    func: callable
    *args:
    **kwargs:
    pure: bool (defaults to True)
        Whether or not the function is pure.  Set ``pure=False`` for
        impure functions like ``np.random.random``.
    workers: set, iterable of sets
        A set of worker hostnames on which computations may be performed.
        Leave empty to default to all workers (common case)

    Examples
    --------
    >>> c = executor.submit(add, a, b)  # doctest: +SKIP

    Returns
    -------
    Future

    See Also
    --------
    distributed.executor.Executor.submit:
    """
    if not callable(func):
        raise TypeError("First input to submit must be a callable function")

    key = kwargs.pop('key', None)
    pure = kwargs.pop('pure', True)
    workers = kwargs.pop('workers', None)

    if key is None:
        if pure:
            key = funcname(func) + '-' + tokenize(func, kwargs, *args)
        else:
            key = funcname(func) + '-' + next(tokens)

    if key in self.futures:
        return Future(key, self)

    if kwargs:
        task = (apply, func, args, kwargs)
    else:
        task = (func,) + args

    if workers is not None:
        restrictions = {key: workers}
    else:
        restrictions = {}

    logger.debug("Submit %s(...), %s", funcname(func), key)
    self.send_to_scheduler({'op': 'update-graph',
                            'dsk': {key: task},
                            'keys': [key],
                            'restrictions': restrictions})

    return Future(key, self)
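# Small sketch of the key scheme used in `submit` above: for pure functions the
# key is derived from `tokenize(func, kwargs, *args)`, so resubmitting an
# identical call maps to the same key (and hence the same cached Future), while
# impure submissions always get a fresh key.  Only `dask.base.tokenize` and
# `dask.utils.funcname` are assumed; `make_key` is a hypothetical helper.
import uuid
from operator import add

from dask.base import tokenize
from dask.utils import funcname


def make_key(func, args, kwargs, pure=True):
    if pure:
        return funcname(func) + '-' + tokenize(func, kwargs, *args)
    return funcname(func) + '-' + str(uuid.uuid4())


assert make_key(add, (1, 2), {}) == make_key(add, (1, 2), {})   # deduplicated
assert make_key(add, (1, 2), {}) != make_key(add, (1, 3), {})   # different args
assert make_key(add, (1, 2), {}, pure=False) != make_key(add, (1, 2), {}, pure=False)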
def build_graph(estimator, cv, scorer, candidate_params, X, y=None,
                groups=None, fit_params=None, iid=True, refit=True,
                error_score='raise', return_train_score=True, cache_cv=True):

    X, y, groups = to_indexable(X, y, groups)
    cv = check_cv(cv, y, is_classifier(estimator))
    # "pairwise" estimators require a different graph for CV splitting
    is_pairwise = getattr(estimator, '_pairwise', False)

    dsk = {}
    X_name, y_name, groups_name = to_keys(dsk, X, y, groups)
    n_splits = compute_n_splits(cv, X, y, groups)

    if fit_params:
        # A mapping of {name: (name, graph-key)}
        param_values = to_indexable(*fit_params.values(), allow_scalars=True)
        fit_params = {k: (k, v) for (k, v) in
                      zip(fit_params, to_keys(dsk, *param_values))}
    else:
        fit_params = {}

    fields, tokens, params = normalize_params(candidate_params)
    main_token = tokenize(normalize_estimator(estimator), fields, params,
                          X_name, y_name, groups_name, fit_params, cv,
                          error_score == 'raise', return_train_score)

    cv_name = 'cv-split-' + main_token
    dsk[cv_name] = (cv_split, cv, X_name, y_name, groups_name,
                    is_pairwise, cache_cv)

    if iid:
        weights = 'cv-n-samples-' + main_token
        dsk[weights] = (cv_n_samples, cv_name)
    else:
        weights = None

    scores = do_fit_and_score(dsk, main_token, estimator, cv_name, fields,
                              tokens, params, X_name, y_name, fit_params,
                              n_splits, error_score, scorer,
                              return_train_score)

    cv_results = 'cv-results-' + main_token
    candidate_params_name = 'cv-parameters-' + main_token
    dsk[candidate_params_name] = (decompress_params, fields, params)
    dsk[cv_results] = (create_cv_results, scores, candidate_params_name,
                       n_splits, error_score, weights)
    keys = [cv_results]

    if refit:
        best_params = 'best-params-' + main_token
        dsk[best_params] = (get_best_params, candidate_params_name, cv_results)
        best_estimator = 'best-estimator-' + main_token
        if fit_params:
            fit_params = (dict, (zip, list(fit_params.keys()),
                                 list(pluck(1, fit_params.values()))))
        dsk[best_estimator] = (fit_best, clone(estimator), best_params,
                               X_name, y_name, fit_params)
        keys.append(best_estimator)

    return dsk, keys, n_splits
def maybe_decode_store(store, lock=False):
    ds = conventions.decode_cf(
        store, mask_and_scale=mask_and_scale, decode_times=decode_times,
        concat_characters=concat_characters, decode_coords=decode_coords,
        drop_variables=drop_variables)

    _protect_dataset_variables_inplace(ds, cache)

    if chunks is not None:
        from dask.base import tokenize

        # if passed an actual file path, augment the token with
        # the file modification time
        if (isinstance(filename_or_obj, basestring) and
                not is_remote_uri(filename_or_obj)):
            mtime = os.path.getmtime(filename_or_obj)
        else:
            mtime = None
        token = tokenize(filename_or_obj, mtime, group, decode_cf,
                         mask_and_scale, decode_times, concat_characters,
                         decode_coords, engine, chunks, drop_variables)
        name_prefix = 'open_dataset-%s' % token
        ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token)
        ds2._file_obj = ds._file_obj
    else:
        ds2 = ds

    return ds2
def test_tokenize_sequences():
    assert tokenize([1]) != tokenize([2])
    assert tokenize([1]) != tokenize((1,))
    assert tokenize([1]) == tokenize([1])

    x = np.arange(2000)  # long enough to drop information in repr
    y = np.arange(2000)
    y[1000] = 0  # middle isn't printed in repr
    assert tokenize([x]) != tokenize([y])
def test_tokenize_numpy_array_on_object_dtype():
    assert (tokenize(np.array(['a', 'aa', 'aaa'], dtype=object)) ==
            tokenize(np.array(['a', 'aa', 'aaa'], dtype=object)))
    assert (tokenize(np.array(['a', None, 'aaa'], dtype=object)) ==
            tokenize(np.array(['a', None, 'aaa'], dtype=object)))
    assert (tokenize(np.array([(1, 'a'), (1, None), (1, 'aaa')], dtype=object)) ==
            tokenize(np.array([(1, 'a'), (1, None), (1, 'aaa')], dtype=object)))
    if sys.version_info[0] == 2:
        assert (tokenize(np.array([unicode("Rebeca Alón", encoding="utf-8")], dtype=object)) ==
                tokenize(np.array([unicode("Rebeca Alón", encoding="utf-8")], dtype=object)))
def _scatter(self, data, workers=None, broadcast=False):
    """ Scatter data to local data dictionary

    Rather than send data out to the cluster we keep data local.  However
    we do report to the scheduler that the local worker has the scattered
    data.  This allows other workers to come by and steal this data if
    desired.

    Keywords like ``broadcast=`` do not work, however operations like
    ``.replicate`` work fine after calling scatter, which can fill in for
    this functionality.
    """
    with log_errors():
        if not (workers is None and broadcast is False):
            raise NotImplementedError("Scatter from worker doesn't support "
                                      "workers or broadcast keywords")

        if isinstance(data, dict) and not all(isinstance(k, (bytes, str))
                                              for k in data):
            d = yield self._scatter(keymap(tokey, data), workers, broadcast)
            raise gen.Return({k: d[tokey(k)] for k in data})

        if isinstance(data, (list, tuple, set, frozenset)):
            keys = []
            for x in data:
                try:
                    keys.append(tokenize(x))
                except:
                    keys.append(str(uuid.uuid1()))
            data2 = dict(zip(keys, data))
        elif isinstance(data, dict):
            keys = set(data)
            data2 = data
        else:
            raise TypeError("Don't know how to scatter %s" % type(data))

        nbytes = valmap(sizeof, data2)

        # self.worker.data.update(data2)  # thread safety matters
        self.worker.loop.add_callback(self.worker.data.update, data2)

        yield self.scheduler.update_data(
            who_has={key: [self.worker.address] for key in data2},
            nbytes=valmap(sizeof, data2),
            client=self.id)

        if isinstance(data, dict):
            out = {k: Future(k, self) for k in data}
        elif isinstance(data, (tuple, list, set, frozenset)):
            out = type(data)([Future(k, self) for k in keys])
        else:
            raise TypeError("Input to scatter must be a list or dict")

        for key in keys:
            self.futures[key]['status'] = 'finished'
            self.futures[key]['event'].set()

        raise gen.Return(out)
def compute(self, *args, **kwargs):
    """ Compute dask collections on cluster

    Parameters
    ----------
    args: iterable of dask objects
        Collections like dask.array or dataframe or dask.value objects
    sync: bool (optional)
        Returns Futures if False (default) or concrete values if True

    Returns
    -------
    Tuple of Futures or concrete values

    Examples
    --------
    >>> from dask import do, value
    >>> from operator import add
    >>> x = dask.do(add)(1, 2)
    >>> y = dask.do(add)(x, x)
    >>> xx, yy = executor.compute(x, y)  # doctest: +SKIP
    >>> xx  # doctest: +SKIP
    <Future: status: finished, key: add-8f6e709446674bad78ea8aeecfee188e>
    >>> xx.result()  # doctest: +SKIP
    3
    >>> yy.result()  # doctest: +SKIP
    6
    """
    sync = kwargs.pop('sync', False)
    assert not kwargs
    if sync:
        return dask.compute(*args, get=self.get)

    variables = [a for a in args if isinstance(a, Base)]

    groups = groupby(lambda x: x._optimize, variables)
    dsk = merge([opt(merge([v.dask for v in val]),
                     [v._keys() for v in val])
                 for opt, val in groups.items()])
    names = ['finalize-%s' % tokenize(v) for v in variables]
    dsk2 = {name: (v._finalize, v, v._keys())
            for name, v in zip(names, variables)}

    self.loop.add_callback(self.scheduler_queue.put_nowait,
                           {'op': 'update-graph',
                            'dsk': merge(dsk, dsk2),
                            'keys': names})

    i = 0
    futures = []
    for arg in args:
        if isinstance(arg, Base):
            futures.append(Future(names[i], self))
            i += 1
        else:
            futures.append(arg)

    return futures
def _futures_to_dask_bag(futures, executor=None):
    executor = default_executor(executor)

    name = 'bag-from-futures-' + tokenize(*futures)
    dsk = {(name, i): future for i, future in enumerate(futures)}

    ensure_default_get(executor)

    raise gen.Return(db.Bag(dsk, name, len(futures)))
def test_tokenize_pandas_invalid_unicode():
    # see https://github.com/dask/dask/issues/2713
    df = pd.DataFrame(
        {"x\ud83d": [1, 2, 3], "y\ud83d": ["4", "asd\ud83d", None]},
        index=[1, 2, 3],
    )
    tokenize(df)
def test_tokenize_set():
    assert tokenize({1, 2, "x", (1, "x")}) == tokenize({1, 2, "x", (1, "x")})
def test_tokenize_dict():
    assert tokenize({"x": 1, 1: "x"}) == tokenize({"x": 1, 1: "x"})
def _construct_collection_plan(cls, dataset_info):

    # Collect necessary information from dataset_info
    fs = dataset_info["fs"]
    parts = dataset_info["parts"]
    paths = dataset_info["paths"]
    filters = dataset_info["filters"]
    pf = dataset_info["pf"]
    split_row_groups = dataset_info["split_row_groups"]
    chunksize = dataset_info["chunksize"]
    gather_statistics = dataset_info["gather_statistics"]
    base_path = dataset_info["base"]
    aggregation_depth = dataset_info["aggregation_depth"]
    index_cols = dataset_info["index_cols"]
    categories = dataset_info["categories"]
    dtypes = dataset_info["dtypes"]
    categories_dict = dataset_info["categories_dict"]
    has_metadata_file = dataset_info["has_metadata_file"]
    metadata_task_size = dataset_info["metadata_task_size"]
    kwargs = dataset_info["kwargs"]

    # Ensure metadata_task_size is set
    # (Using config file or defaults)
    metadata_task_size = _set_metadata_task_size(
        dataset_info["metadata_task_size"], fs)

    # Determine which columns need statistics.
    # At this point, gather_statistics is only True if
    # the user specified calculate_divisions=True
    filter_columns = {t[0] for t in flatten(filters or [], container=list)}
    stat_col_indices = {}
    _index_cols = index_cols if (gather_statistics and len(index_cols) == 1) else []
    for i, name in enumerate(pf.columns):
        if name in _index_cols or name in filter_columns:
            stat_col_indices[name] = i

    # Decide final `gather_statistics` setting.
    # NOTE: The "fastparquet" engine requires statistics for
    # filtering even if the filter is on a partitioned column
    gather_statistics = _set_gather_statistics(
        gather_statistics,
        chunksize,
        split_row_groups,
        aggregation_depth,
        filter_columns,
        set(stat_col_indices) | filter_columns,
    )

    # Define common_kwargs
    common_kwargs = {
        "categories": categories_dict or categories,
        "root_cats": pf.cats,
        "root_file_scheme": pf.file_scheme,
        "base_path": base_path,
        **kwargs,
    }

    # Check if this is a very simple case where we can just
    # return the path names. This requires that `parts`
    # already be a list of paths. Also, we cannot be splitting
    # by row-group or collecting statistics.
    if (gather_statistics is False
            and not split_row_groups
            and isinstance(parts, list)
            and len(parts)
            and isinstance(parts[0], str)):
        return (
            [{"piece": (full_path, None)} for full_path in parts],
            [],
            common_kwargs,
        )

    dataset_info_kwargs = {
        "fs": fs,
        "split_row_groups": split_row_groups,
        "gather_statistics": gather_statistics,
        "filters": filters,
        "dtypes": dtypes,
        "stat_col_indices": stat_col_indices,
        "aggregation_depth": aggregation_depth,
        "chunksize": chunksize,
        "root_cats": pf.cats,
        "root_file_scheme": pf.file_scheme,
        "base_path": "" if base_path is None else base_path,
        "has_metadata_file": has_metadata_file,
    }

    if (has_metadata_file
            or metadata_task_size == 0
            or metadata_task_size > len(paths)):
        # Construct the output-partitioning plan on the
        # client process (in serial).  This means we have
        # a global _metadata file, or that `metadata_task_size`
        # is zero or larger than the number of files.
        pf_or_paths = pf if has_metadata_file else paths
        parts, stats = cls._collect_file_parts(pf_or_paths, dataset_info_kwargs)

    else:
        # We DON'T have a global _metadata file to work with.
        # We should loop over files in parallel
        parts, stats = [], []
        if paths:
            # Build and compute a task graph to construct stats/parts
            gather_parts_dsk = {}
            name = "gather-pq-parts-" + tokenize(paths, dataset_info_kwargs)
            finalize_list = []
            for task_i, file_i in enumerate(
                    range(0, len(paths), metadata_task_size)):
                finalize_list.append((name, task_i))
                gather_parts_dsk[finalize_list[-1]] = (
                    cls._collect_file_parts,
                    paths[file_i:file_i + metadata_task_size],
                    dataset_info_kwargs,
                )

            def _combine_parts(parts_and_stats):
                parts, stats = [], []
                for part, stat in parts_and_stats:
                    parts += part
                    if stat:
                        stats += stat
                return parts, stats

            gather_parts_dsk["final-" + name] = (
                _combine_parts,
                finalize_list,
            )
            parts, stats = Delayed("final-" + name, gather_parts_dsk).compute()

    return parts, stats, common_kwargs
def test_tokenize_pandas_mixed_unicode_bytes():
    df = pd.DataFrame(
        {u"ö".encode("utf8"): [1, 2, 3], u"ö": [u"ö", u"ö".encode("utf8"), None]},
        index=[1, 2, 3],
    )
    tokenize(df)
def ukey(self, path):
    # Unique identifier for the file's contents: derived from the
    # modification time, so it changes whenever the file is rewritten.
    adl_path = self._trim_filename(path)
    return tokenize(self.info(adl_path)['modificationTime'])
def test_tokenize_object_array_with_nans():
    a = np.array(["foo", "Jos\xe9", np.nan], dtype="O")
    assert tokenize(a) == tokenize(a)
def test_tokenize_numpy_scalar():
    assert tokenize(np.array(1.0, dtype="f8")) == tokenize(np.array(1.0, dtype="f8"))
    assert tokenize(
        np.array([(1, 2)], dtype=[("a", "i4"), ("b", "i8")])[0]
    ) == tokenize(np.array([(1, 2)], dtype=[("a", "i4"), ("b", "i8")])[0])
def test_tokenize():
    a = (1, 2, 3)
    assert isinstance(tokenize(a), (str, bytes))
def test_tokenize_literal():
    assert tokenize(literal(["x", 1])) == tokenize(literal(["x", 1]))
def test_tokenize_pandas_index():
    idx = pd.Index(["a", "b"])
    assert tokenize(idx) == tokenize(idx)

    idx = pd.MultiIndex.from_product([["a", "b"], [0, 1]])
    assert tokenize(idx) == tokenize(idx)
def open_rasterio( filename, parse_coordinates=None, chunks=None, cache=None, lock=None, **kwargs, ): """Open a file with rasterio (experimental). This should work with any file that rasterio can open (most often: geoTIFF). The x and y coordinates are generated automatically from the file's geoinformation, shifted to the center of each pixel (see `"PixelIsArea" Raster Space <http://web.archive.org/web/20160326194152/http://remotesensing.org/geotiff/spec/geotiff2.5.html#2.5.2>`_ for more information). You can generate 2D coordinates from the file's attributes with:: >>> from affine import Affine >>> da = xr.open_rasterio( ... "https://github.com/mapbox/rasterio/raw/1.2.1/tests/data/RGB.byte.tif" ... ) >>> da <xarray.DataArray (band: 3, y: 718, x: 791)> [1703814 values with dtype=uint8] Coordinates: * band (band) int64 1 2 3 * y (y) float64 2.827e+06 2.826e+06 2.826e+06 ... 2.612e+06 2.612e+06 * x (x) float64 1.021e+05 1.024e+05 1.027e+05 ... 3.389e+05 3.392e+05 Attributes: transform: (300.0379266750948, 0.0, 101985.0, 0.0, -300.041782729805... crs: +init=epsg:32618 res: (300.0379266750948, 300.041782729805) is_tiled: 0 nodatavals: (0.0, 0.0, 0.0) scales: (1.0, 1.0, 1.0) offsets: (0.0, 0.0, 0.0) AREA_OR_POINT: Area >>> transform = Affine(*da.attrs["transform"]) >>> transform Affine(300.0379266750948, 0.0, 101985.0, 0.0, -300.041782729805, 2826915.0) >>> nx, ny = da.sizes["x"], da.sizes["y"] >>> x, y = transform * np.meshgrid(np.arange(nx) + 0.5, np.arange(ny) + 0.5) >>> x array([[102135.01896334, 102435.05689001, 102735.09481669, ..., 338564.90518331, 338864.94310999, 339164.98103666], [102135.01896334, 102435.05689001, 102735.09481669, ..., 338564.90518331, 338864.94310999, 339164.98103666], [102135.01896334, 102435.05689001, 102735.09481669, ..., 338564.90518331, 338864.94310999, 339164.98103666], ..., [102135.01896334, 102435.05689001, 102735.09481669, ..., 338564.90518331, 338864.94310999, 339164.98103666], [102135.01896334, 102435.05689001, 102735.09481669, ..., 338564.90518331, 338864.94310999, 339164.98103666], [102135.01896334, 102435.05689001, 102735.09481669, ..., 338564.90518331, 338864.94310999, 339164.98103666]]) Parameters ---------- filename : str, rasterio.DatasetReader, or rasterio.WarpedVRT Path to the file to open. Or already open rasterio dataset. parse_coordinates : bool, optional Whether to parse the x and y coordinates out of the file's ``transform`` attribute or not. The default is to automatically parse the coordinates only if they are rectilinear (1D). It can be useful to set ``parse_coordinates=False`` if your files are very large or if you don't need the coordinates. chunks : int, tuple or dict, optional Chunk sizes along each dimension, e.g., ``5``, ``(5, 5)`` or ``{'x': 5, 'y': 5}``. If chunks is provided, it used to load the new DataArray into a dask array. cache : bool, optional If True, cache data loaded from the underlying datastore in memory as NumPy arrays when accessed to avoid reading from the underlying data- store multiple times. Defaults to True unless you specify the `chunks` argument to use dask, in which case it defaults to False. lock : False, True or threading.Lock, optional If chunks is provided, this argument is passed on to :py:func:`dask.array.from_array`. By default, a global lock is used to avoid issues with concurrent access to the same file when using dask's multithreaded backend. Returns ------- data : DataArray The newly created DataArray. 
""" import rasterio from rasterio.vrt import WarpedVRT vrt_params = None if isinstance(filename, rasterio.io.DatasetReader): filename = filename.name elif isinstance(filename, rasterio.vrt.WarpedVRT): vrt = filename filename = vrt.src_dataset.name vrt_params = dict( src_crs=vrt.src_crs.to_string(), crs=vrt.crs.to_string(), resampling=vrt.resampling, tolerance=vrt.tolerance, src_nodata=vrt.src_nodata, nodata=vrt.nodata, width=vrt.width, height=vrt.height, src_transform=vrt.src_transform, transform=vrt.transform, dtype=vrt.working_dtype, warp_extras=vrt.warp_extras, ) if lock is None: lock = RASTERIO_LOCK manager = CachingFileManager( rasterio.open, filename, lock=lock, mode="r", kwargs=kwargs, ) riods = manager.acquire() if vrt_params is not None: riods = WarpedVRT(riods, **vrt_params) if cache is None: cache = chunks is None coords = {} # Get bands if riods.count < 1: raise ValueError("Unknown dims") coords["band"] = np.asarray(riods.indexes) # Get coordinates if riods.transform.is_rectilinear: # 1d coordinates parse = True if parse_coordinates is None else parse_coordinates if parse: nx, ny = riods.width, riods.height # xarray coordinates are pixel centered x, _ = riods.transform * (np.arange(nx) + 0.5, np.zeros(nx) + 0.5) _, y = riods.transform * (np.zeros(ny) + 0.5, np.arange(ny) + 0.5) coords["y"] = y coords["x"] = x else: # 2d coordinates parse = False if (parse_coordinates is None) else parse_coordinates if parse: warnings.warn( "The file coordinates' transformation isn't " "rectilinear: xarray won't parse the coordinates " "in this case. Set `parse_coordinates=False` to " "suppress this warning.", RuntimeWarning, stacklevel=3, ) # Attributes attrs = {} # Affine transformation matrix (always available) # This describes coefficients mapping pixel coordinates to CRS # For serialization store as tuple of 6 floats, the last row being # always (0, 0, 1) per definition (see # https://github.com/sgillies/affine) attrs["transform"] = tuple(riods.transform)[:6] if hasattr(riods, "crs") and riods.crs: # CRS is a dict-like object specific to rasterio # If CRS is not None, we convert it back to a PROJ4 string using # rasterio itself try: attrs["crs"] = riods.crs.to_proj4() except AttributeError: attrs["crs"] = riods.crs.to_string() if hasattr(riods, "res"): # (width, height) tuple of pixels in units of CRS attrs["res"] = riods.res if hasattr(riods, "is_tiled"): # Is the TIF tiled? 
(bool) # We cast it to an int for netCDF compatibility attrs["is_tiled"] = np.uint8(riods.is_tiled) if hasattr(riods, "nodatavals"): # The nodata values for the raster bands attrs["nodatavals"] = tuple(np.nan if nodataval is None else nodataval for nodataval in riods.nodatavals) if hasattr(riods, "scales"): # The scale values for the raster bands attrs["scales"] = riods.scales if hasattr(riods, "offsets"): # The offset values for the raster bands attrs["offsets"] = riods.offsets if hasattr(riods, "descriptions") and any(riods.descriptions): # Descriptions for each dataset band attrs["descriptions"] = riods.descriptions if hasattr(riods, "units") and any(riods.units): # A list of units string for each dataset band attrs["units"] = riods.units # Parse extra metadata from tags, if supported parsers = {"ENVI": _parse_envi, "GTiff": lambda m: m} driver = riods.driver if driver in parsers: if driver == "GTiff": meta = parsers[driver](riods.tags()) else: meta = parsers[driver](riods.tags(ns=driver)) for k, v in meta.items(): # Add values as coordinates if they match the band count, # as attributes otherwise if isinstance(v, (list, np.ndarray)) and len(v) == riods.count: coords[k] = ("band", np.asarray(v)) else: attrs[k] = v data = indexing.LazilyIndexedArray( RasterioArrayWrapper(manager, lock, vrt_params)) # this lets you write arrays loaded with rasterio data = indexing.CopyOnWriteArray(data) if cache and chunks is None: data = indexing.MemoryCachedArray(data) result = DataArray(data=data, dims=("band", "y", "x"), coords=coords, attrs=attrs) if chunks is not None: from dask.base import tokenize # augment the token with the file modification time try: mtime = os.path.getmtime(filename) except OSError: # the filename is probably an s3 bucket rather than a regular file mtime = None token = tokenize(filename, mtime, chunks) name_prefix = f"open_rasterio-{token}" result = result.chunk(chunks, name_prefix=name_prefix, token=token) # Make the file closeable result.set_close(manager.close) return result
def test_tokenize_numpy_datetime():
    tokenize(np.array(['2000-01-01T12:00:00'], dtype='M8[ns]'))
def test_tokenize_object_with_recursion_error_returns_uuid():
    cycle = dict(a=None)
    cycle["a"] = cycle

    assert len(tokenize(cycle)) == 32
def test_tokenize_kwargs():
    assert tokenize(5, x=1) == tokenize(5, x=1)
    assert tokenize(5) != tokenize(5, x=1)
    assert tokenize(5, x=1) != tokenize(5, x=2)
    assert tokenize(5, x=1) != tokenize(5, y=1)
def test_tokenize_numpy_array_consistent_on_values():
    assert tokenize(
        np.random.RandomState(1234).random_sample(1000)
    ) == tokenize(np.random.RandomState(1234).random_sample(1000))
def test_tokenize_object_array_with_nans():
    a = np.array([u'foo', u'Jos\xe9', np.nan], dtype='O')
    assert tokenize(a) == tokenize(a)
def test_tokenize_dict():
    assert tokenize({'x': 1, 1: 'x'}) == tokenize({'x': 1, 1: 'x'})
def test_tokenize_base_types(x):
    assert tokenize(x) == tokenize(x), x
def test_custom_collection():
    # Arbitrary hashables
    h1 = object()
    h2 = object()

    dsk = {("x", h1): 1, ("x", h2): 2}
    dsk2 = {
        ("y", h1): (add, ("x", h1), ("x", h2)),
        ("y", h2): (add, ("y", h1), 1),
    }
    dsk2.update(dsk)
    dsk3 = {"z": (add, ("y", h1), ("y", h2))}
    dsk3.update(dsk2)

    w = Tuple({}, [])  # A collection can have no keys at all
    x = Tuple(dsk, [("x", h1), ("x", h2)])
    y = Tuple(dsk2, [("y", h1), ("y", h2)])
    z = Tuple(dsk3, ["z"])
    # Collection with multiple names
    t = w + x + y + z

    # __slots__ defined on base mixin class propagates
    with pytest.raises(AttributeError):
        x.foo = 1

    # is_dask_collection
    assert is_dask_collection(w)
    assert is_dask_collection(x)
    assert is_dask_collection(y)
    assert is_dask_collection(z)
    assert is_dask_collection(t)

    # tokenize
    assert tokenize(w) == tokenize(w)
    assert tokenize(x) == tokenize(x)
    assert tokenize(y) == tokenize(y)
    assert tokenize(z) == tokenize(z)
    assert tokenize(t) == tokenize(t)
    # All tokens are unique
    assert len({tokenize(coll) for coll in (w, x, y, z, t)}) == 5

    # get_collection_names
    assert get_collection_names(w) == set()
    assert get_collection_names(x) == {"x"}
    assert get_collection_names(y) == {"y"}
    assert get_collection_names(z) == {"z"}
    assert get_collection_names(t) == {"x", "y", "z"}

    # compute
    assert w.compute() == ()
    assert x.compute() == (1, 2)
    assert y.compute() == (3, 4)
    assert z.compute() == (7,)
    assert dask.compute(w, [{"x": x}, y, z]) == ((), [{"x": (1, 2)}, (3, 4), (7,)])
    assert t.compute() == (1, 2, 3, 4, 7)

    # persist
    t2 = t.persist()
    assert isinstance(t2, Tuple)
    assert t2._keys == t._keys
    assert sorted(t2._dask.values()) == [1, 2, 3, 4, 7]
    assert t2.compute() == (1, 2, 3, 4, 7)

    w2, x2, y2, z2 = dask.persist(w, x, y, z)
    assert y2._keys == y._keys
    assert y2._dask == {("y", h1): 3, ("y", h2): 4}
    assert y2.compute() == (3, 4)

    t3 = x2 + y2 + z2
    assert t3.compute() == (1, 2, 3, 4, 7)

    # __dask_postpersist__ with name change
    rebuild, args = w.__dask_postpersist__()
    w3 = rebuild({}, *args, rename={"w": "w3"})
    assert w3.compute() == ()

    rebuild, args = x.__dask_postpersist__()
    x3 = rebuild({("x3", h1): 10, ("x3", h2): 20}, *args, rename={"x": "x3"})
    assert x3.compute() == (10, 20)

    rebuild, args = z.__dask_postpersist__()
    z3 = rebuild({"z3": 70}, *args, rename={"z": "z3"})
    assert z3.compute() == (70,)
def reduction(
    args,
    chunk=None,
    aggregate=None,
    combine=None,
    meta=None,
    token=None,
    chunk_kwargs=None,
    aggregate_kwargs=None,
    combine_kwargs=None,
    split_every=None,
    **kwargs,
):
    """Generic tree reduction operation.

    Parameters
    ----------
    args :
        Positional arguments for the `chunk` function. All `dask.dataframe`
        objects should be partitioned and indexed equivalently.
    chunk : function [block-per-arg] -> block
        Function to operate on each block of data
    aggregate : function list-of-blocks -> block
        Function to operate on the list of results of chunk
    combine : function list-of-blocks -> block, optional
        Function to operate on intermediate lists of results of chunk
        in a tree-reduction. If not provided, defaults to aggregate.
    $META
    token : str, optional
        The name to use for the output keys.
    chunk_kwargs : dict, optional
        Keywords for the chunk function only.
    aggregate_kwargs : dict, optional
        Keywords for the aggregate function only.
    combine_kwargs : dict, optional
        Keywords for the combine function only.
    split_every : int, optional
        Group partitions into groups of this size while performing a
        tree-reduction. If set to False, no tree-reduction will be used,
        and all intermediates will be concatenated and passed to
        ``aggregate``. Default is 8.
    kwargs :
        All remaining keywords will be passed to ``chunk``, ``aggregate``,
        and ``combine``.
    """
    if chunk_kwargs is None:
        chunk_kwargs = dict()
    if aggregate_kwargs is None:
        aggregate_kwargs = dict()
    chunk_kwargs.update(kwargs)
    aggregate_kwargs.update(kwargs)

    if combine is None:
        if combine_kwargs:
            raise ValueError("`combine_kwargs` provided with no `combine`")
        combine = aggregate
        combine_kwargs = aggregate_kwargs
    else:
        if combine_kwargs is None:
            combine_kwargs = dict()
        combine_kwargs.update(kwargs)

    if not isinstance(args, (tuple, list)):
        args = [args]

    npartitions = set(
        arg.npartitions for arg in args if isinstance(arg, _Frame)
    )
    if len(npartitions) > 1:
        raise ValueError("All arguments must have same number of partitions")
    npartitions = npartitions.pop()

    if split_every is None:
        split_every = 8
    elif split_every is False:
        split_every = npartitions
    elif split_every < 2 or not isinstance(split_every, int):
        raise ValueError("split_every must be an integer >= 2")

    token_key = tokenize(
        token or (chunk, aggregate),
        meta,
        args,
        chunk_kwargs,
        aggregate_kwargs,
        combine_kwargs,
        split_every,
    )

    # Chunk
    a = "{0}-chunk-{1}".format(token or funcname(chunk), token_key)
    if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs:
        dsk = {
            (a, 0, i): (chunk, key)
            for i, key in enumerate(args[0].__dask_keys__())
        }
    else:
        dsk = {
            (a, 0, i): (
                apply,
                chunk,
                [(x._name, i) if isinstance(x, _Frame) else x for x in args],
                chunk_kwargs,
            )
            for i in range(args[0].npartitions)
        }

    # Combine
    b = "{0}-combine-{1}".format(token or funcname(combine), token_key)
    k = npartitions
    depth = 0
    while k > split_every:
        for part_i, inds in enumerate(partition_all(split_every, range(k))):
            conc = (list, [(a, depth, i) for i in inds])
            dsk[(b, depth + 1, part_i)] = (
                (apply, combine, [conc], combine_kwargs)
                if combine_kwargs
                else (combine, conc)
            )
        k = part_i + 1
        a = b
        depth += 1

    # Aggregate
    b = "{0}-agg-{1}".format(token or funcname(aggregate), token_key)
    conc = (list, [(a, depth, i) for i in range(k)])
    if aggregate_kwargs:
        dsk[(b, 0)] = (apply, aggregate, [conc], aggregate_kwargs)
    else:
        dsk[(b, 0)] = (aggregate, conc)

    if meta is None:
        meta_chunk = _emulate(apply, chunk, args, chunk_kwargs)
        meta = _emulate(apply, aggregate, [[meta_chunk]], aggregate_kwargs)
    meta = dd.core.make_meta(meta)

    graph = HighLevelGraph.from_collections(b, dsk, dependencies=args)
    return dd.core.new_dd_object(graph, b, meta, (None, None))
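# Illustrative-only sketch of the tree shape that `reduction` above builds:
# per-partition results are combined in groups of `split_every` per level
# until few enough remain, then `aggregate` runs once over the remainder.
# Pure Python with `toolz.partition_all` (which the code above also uses);
# `tree_reduce` is a hypothetical helper that only mirrors the looping logic.
from toolz import partition_all


def tree_reduce(parts, combine, aggregate, split_every=8):
    while len(parts) > split_every:
        parts = [combine(list(group))
                 for group in partition_all(split_every, parts)]
    return aggregate(list(parts))


# Summing 100 per-partition results with fan-in 8: 100 -> 13 -> 2 -> aggregate.
parts = list(range(100))
assert tree_reduce(parts, combine=sum, aggregate=sum, split_every=8) == sum(range(100))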
def test_tokenize_discontiguous_numpy_array():
    tokenize(np.random.random(8)[::2])
def _parallel_var(ddf, meta, skipna, split_every, out):
    def _local_var(x, skipna):
        if skipna:
            n = x.count(skipna=skipna)
            avg = x.mean(skipna=skipna)
        else:
            # Not skipping nulls, so might as well
            # avoid the full `count` operation
            n = len(x)
            avg = x.sum(skipna=skipna) / n
        m2 = ((x - avg) ** 2).sum(skipna=skipna)
        return n, avg, m2

    def _aggregate_var(parts):
        n, avg, m2 = parts[0]
        for i in range(1, len(parts)):
            n_a, avg_a, m2_a = n, avg, m2
            n_b, avg_b, m2_b = parts[i]
            n = n_a + n_b
            avg = (n_a * avg_a + n_b * avg_b) / n
            delta = avg_b - avg_a
            m2 = m2_a + m2_b + delta ** 2 * n_a * n_b / n
        return n, avg, m2

    def _finalize_var(vals):
        n, _, m2 = vals
        return m2 / (n - 1)

    # Build graph
    nparts = ddf.npartitions
    if not split_every:
        split_every = nparts
    name = "var-" + tokenize(skipna, split_every, out)
    local_name = "local-" + name
    num = ddf._get_numeric_data()
    dsk = {
        (local_name, n, 0): (_local_var, (num._name, n), skipna)
        for n in range(nparts)
    }

    # Use reduction tree
    widths = [nparts]
    while nparts > 1:
        nparts = math.ceil(nparts / split_every)
        widths.append(nparts)
    height = len(widths)
    for depth in range(1, height):
        for group in range(widths[depth]):
            p_max = widths[depth - 1]
            lstart = split_every * group
            lstop = min(lstart + split_every, p_max)
            node_list = [
                (local_name, p, depth - 1) for p in range(lstart, lstop)
            ]
            dsk[(local_name, group, depth)] = (_aggregate_var, node_list)
    if height == 1:
        group = depth = 0
    dsk[(name, 0)] = (_finalize_var, (local_name, group, depth))

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[num, ddf])
    result = dd.core.new_dd_object(graph, name, meta, (None, None))
    if isinstance(ddf, DataFrame):
        result.divisions = (min(ddf.columns), max(ddf.columns))
    return handle_out(out, result)
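# Sanity-check sketch (not part of the code above) of the pairwise merge used
# in `_aggregate_var`: combining (count, mean, M2) triples in the style of
# Chan et al.'s parallel variance update, checked against numpy's ddof=1
# variance on the concatenated data.  Only numpy is assumed; `merge_var` and
# `part` are hypothetical helpers for the illustration.
import numpy as np


def merge_var(part_a, part_b):
    n_a, avg_a, m2_a = part_a
    n_b, avg_b, m2_b = part_b
    n = n_a + n_b
    avg = (n_a * avg_a + n_b * avg_b) / n
    delta = avg_b - avg_a
    m2 = m2_a + m2_b + delta ** 2 * n_a * n_b / n
    return n, avg, m2


def part(v):
    # Per-partition statistics: count, mean, and sum of squared deviations.
    return len(v), v.mean(), ((v - v.mean()) ** 2).sum()


rng = np.random.RandomState(0)
x, y = rng.normal(size=50), rng.normal(size=70)

n, _, m2 = merge_var(part(x), part(y))
assert np.isclose(m2 / (n - 1), np.var(np.concatenate([x, y]), ddof=1))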
def test_tokenize_numpy_datetime():
    tokenize(np.array(["2000-01-01T12:00:00"], dtype="M8[ns]"))
def affine_transform(
        image,
        matrix,
        offset=0.0,
        output_shape=None,
        order=1,
        output_chunks=None,
        **kwargs
):
    """Apply an affine transform using Dask.

    For every output chunk, only the slice containing the relevant part of the
    image is processed. Chunkwise processing is performed either using
    `ndimage.affine_transform` or `cupyx.scipy.ndimage.affine_transform`,
    depending on the input type.

    Notes
    -----
    Differences to `ndimage.affine_transformation`:
    - currently, prefiltering is not supported
      (affecting the output in case of interpolation `order > 1`)
    - default order is 1
    - modes 'reflect', 'mirror' and 'wrap' are not supported

    Arguments equal to `ndimage.affine_transformation`,
    except for `output_chunks`.

    Parameters
    ----------
    image : array_like (Numpy Array, Cupy Array, Dask Array...)
        The image array.
    matrix : array (ndim,), (ndim, ndim), (ndim, ndim+1) or (ndim+1, ndim+1)
        Transformation matrix.
    offset : float or sequence, optional
        The offset into the array where the transform is applied. If a float,
        `offset` is the same for each axis. If a sequence, `offset` should
        contain one value for each axis.
    output_shape : tuple of ints, optional
        The shape of the array to be returned.
    order : int, optional
        The order of the spline interpolation. Note that for order>1
        scipy's affine_transform applies prefiltering, which is not
        yet supported and skipped in this implementation.
    output_chunks : tuple of ints, optional
        The shape of the chunks of the output Dask Array.

    Returns
    -------
    affine_transform : Dask Array
        A dask array representing the transformed output
    """

    if not type(image) == da.core.Array:
        image = da.from_array(image)

    if output_shape is None:
        output_shape = image.shape

    if output_chunks is None:
        output_chunks = image.shape

    # Perform test run to ensure parameter validity.
    ndimage_affine_transform(np.zeros([0] * image.ndim), matrix, offset)

    # Make sure parameters contained in matrix and offset
    # are not overlapping, i.e. that the offset is valid as
    # it needs to be modified for each chunk.
    # Further parameter checks are performed directly by
    # `ndimage.affine_transform`.
    matrix = np.asarray(matrix)
    offset = np.asarray(offset).squeeze()

    # these lines were copied and adapted from `ndimage.affine_transform`
    if (matrix.ndim == 2 and matrix.shape[1] == image.ndim + 1 and
            (matrix.shape[0] in [image.ndim, image.ndim + 1])):
        # assume input is homogeneous coordinate transformation matrix
        offset = matrix[:image.ndim, image.ndim]
        matrix = matrix[:image.ndim, :image.ndim]

    # process kwargs
    # prefilter is not yet supported
    if 'prefilter' in kwargs:
        if kwargs['prefilter'] and order > 1:
            warnings.warn('Currently, `dask_image.ndinterp.affine_transform` '
                          'doesn\'t support `prefilter=True`. Proceeding with'
                          ' `prefilter=False`, which if order > 1 can lead '
                          'to the output containing more blur than with '
                          'prefiltering.', UserWarning)
        del kwargs['prefilter']

    if 'mode' in kwargs:
        if kwargs['mode'] in ['wrap', 'reflect', 'mirror']:
            raise NotImplementedError("Mode %s is not currently supported."
                                      % kwargs['mode'])

    n = image.ndim
    image_shape = image.shape

    # calculate output array properties
    normalized_chunks = da.core.normalize_chunks(output_chunks,
                                                 tuple(output_shape))
    block_indices = product(*(range(len(bds)) for bds in normalized_chunks))
    block_offsets = [np.cumsum((0,) + bds[:-1]) for bds in normalized_chunks]

    # use dispatching mechanism to determine backend
    affine_transform_method = dispatch_affine_transform(image)
    asarray_method = dispatch_asarray(image)

    # construct dask graph for output array
    # using unique and deterministic identifier
    output_name = 'affine_transform-' + tokenize(image, matrix, offset,
                                                 output_shape, output_chunks,
                                                 kwargs)
    output_layer = {}
    rel_images = []
    for ib, block_ind in enumerate(block_indices):

        out_chunk_shape = [normalized_chunks[dim][block_ind[dim]]
                           for dim in range(n)]
        out_chunk_offset = [block_offsets[dim][block_ind[dim]]
                            for dim in range(n)]

        out_chunk_edges = np.array([i for i in np.ndindex(tuple([2] * n))]) \
            * np.array(out_chunk_shape) + np.array(out_chunk_offset)

        # map output chunk edges onto input image coordinates
        # to define the input region relevant for the current chunk
        if matrix.ndim == 1 and len(matrix) == image.ndim:
            rel_image_edges = matrix * out_chunk_edges + offset
        else:
            rel_image_edges = np.dot(matrix, out_chunk_edges.T).T + offset

        rel_image_i = np.min(rel_image_edges, 0)
        rel_image_f = np.max(rel_image_edges, 0)

        # Calculate edge coordinates required for the footprint of the
        # spline kernel according to
        # https://github.com/scipy/scipy/blob/9c0d08d7d11fc33311a96d2ac3ad73c8f6e3df00/scipy/ndimage/src/ni_interpolation.c#L412-L419  # noqa: E501
        # Also see this discussion:
        # https://github.com/dask/dask-image/issues/24#issuecomment-706165593  # noqa: E501
        for dim in range(n):

            if order % 2 == 0:
                rel_image_i[dim] += 0.5
                rel_image_f[dim] += 0.5

            rel_image_i[dim] = np.floor(rel_image_i[dim]) - order // 2
            rel_image_f[dim] = np.floor(rel_image_f[dim]) - order // 2 + order

            if order == 0:  # required for consistency with scipy.ndimage
                rel_image_i[dim] -= 1

        # clip image coordinates to image extent
        for dim, s in zip(range(n), image_shape):
            rel_image_i[dim] = np.clip(rel_image_i[dim], 0, s - 1)
            rel_image_f[dim] = np.clip(rel_image_f[dim], 0, s - 1)

        rel_image_slice = tuple([slice(int(rel_image_i[dim]),
                                       int(rel_image_f[dim]) + 2)
                                 for dim in range(n)])

        rel_image = image[rel_image_slice]

        """Block comment for future developers explaining how `offset` is
        transformed into `offset_prime` for each output chunk.
        Modify offset to point into cropped image.
        y = Mx + o
        Coordinate substitution:
        y' = y - y0(min_coord_px)
        x' = x - x0(chunk_offset)
        Then:
        y' = Mx' + o + Mx0 - y0
        M' = M
        o' = o + Mx0 - y0
        """
        offset_prime = offset + np.dot(matrix, out_chunk_offset) - rel_image_i

        output_layer[(output_name,) + block_ind] = (
            affine_transform_method,
            (da.core.concatenate3, rel_image.__dask_keys__()),
            asarray_method(matrix),
            offset_prime,
            tuple(out_chunk_shape),  # output_shape
            None,  # out
            order,
            'constant' if 'mode' not in kwargs else kwargs['mode'],
            0. if 'cval' not in kwargs else kwargs['cval'],
            False  # prefilter
        )

        rel_images.append(rel_image)

    graph = HighLevelGraph.from_collections(output_name, output_layer,
                                            dependencies=[image] + rel_images)

    meta = dispatch_asarray(image)([0]).astype(image.dtype)

    transformed = da.Array(graph, output_name,
                           shape=tuple(output_shape),
                           # chunks=output_chunks,
                           chunks=normalized_chunks,
                           meta=meta)

    return transformed
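# Numerical check (illustration only, not part of dask-image) of the coordinate
# substitution described in the block comment above: with y = M @ x + o,
# shifting the output chunk by x0 and the cropped input region by y0 gives the
# per-chunk offset o' = o + M @ x0 - y0.  Only numpy is assumed; all values
# below are made up.
import numpy as np

rng = np.random.RandomState(42)
M = rng.normal(size=(2, 2))
o = rng.normal(size=2)
x0 = np.array([16.0, 32.0])   # offset of the output chunk
y0 = np.array([5.0, 7.0])     # minimum input coordinate of the cropped region

x = rng.normal(size=2)        # a point in global output coordinates
y = M @ x + o                 # its image in global input coordinates

o_prime = o + M @ x0 - y0
y_prime = M @ (x - x0) + o_prime   # same point in chunk-local coordinates
assert np.allclose(y_prime, y - y0)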
def open_rasterio(filename, chunks=None, cache=None, lock=None):
    """Open a file with rasterio (experimental).

    This should work with any file that rasterio can open (most often:
    geoTIFF). The x and y coordinates are generated automatically from the
    file's geoinformation, shifted to the center of each pixel (see
    `"PixelIsArea" Raster Space
    <http://web.archive.org/web/20160326194152/http://remotesensing.org/geotiff/spec/geotiff2.5.html#2.5.2>`_
    for more information).

    Parameters
    ----------
    filename : str
        Path to the file to open.
    chunks : int, tuple or dict, optional
        Chunk sizes along each dimension, e.g., ``5``, ``(5, 5)`` or
        ``{'x': 5, 'y': 5}``. If chunks is provided, it used to load the new
        DataArray into a dask array.
    cache : bool, optional
        If True, cache data loaded from the underlying datastore in memory as
        NumPy arrays when accessed to avoid reading from the underlying data-
        store multiple times. Defaults to True unless you specify the `chunks`
        argument to use dask, in which case it defaults to False.
    lock : False, True or threading.Lock, optional
        If chunks is provided, this argument is passed on to
        :py:func:`dask.array.from_array`. By default, a global lock is used to
        avoid issues with concurrent access to the same file when using dask's
        multithreaded backend.

    Returns
    -------
    data : DataArray
        The newly created DataArray.
    """
    import rasterio
    riods = rasterio.open(filename, mode='r')

    if cache is None:
        cache = chunks is None

    coords = OrderedDict()

    # Get bands
    if riods.count < 1:
        raise ValueError('Unknown dims')
    coords['band'] = np.asarray(riods.indexes)

    # Get geo coords
    nx, ny = riods.width, riods.height
    dx, dy = riods.res[0], -riods.res[1]
    x0 = riods.bounds.right if dx < 0 else riods.bounds.left
    y0 = riods.bounds.top if dy < 0 else riods.bounds.bottom
    coords['y'] = np.linspace(start=y0 + dy / 2, num=ny,
                              stop=(y0 + (ny - 1) * dy) + dy / 2)
    coords['x'] = np.linspace(start=x0 + dx / 2, num=nx,
                              stop=(x0 + (nx - 1) * dx) + dx / 2)

    # Attributes
    attrs = {}
    if hasattr(riods, 'crs') and riods.crs:
        # CRS is a dict-like object specific to rasterio
        # If CRS is not None, we convert it back to a PROJ4 string using
        # rasterio itself
        attrs['crs'] = riods.crs.to_string()
    if hasattr(riods, 'res'):
        # (width, height) tuple of pixels in units of CRS
        attrs['res'] = riods.res
    if hasattr(riods, 'is_tiled'):
        # Is the TIF tiled? (bool)
        # We cast it to an int for netCDF compatibility
        attrs['is_tiled'] = np.uint8(riods.is_tiled)
    if hasattr(riods, 'transform'):
        # Affine transformation matrix (tuple of floats)
        # Describes coefficients mapping pixel coordinates to CRS
        attrs['transform'] = tuple(riods.transform)

    data = indexing.LazilyIndexedArray(RasterioArrayWrapper(riods))

    # this lets you write arrays loaded with rasterio
    data = indexing.CopyOnWriteArray(data)
    if cache and (chunks is None):
        data = indexing.MemoryCachedArray(data)

    result = DataArray(data=data, dims=('band', 'y', 'x'),
                       coords=coords, attrs=attrs)

    if chunks is not None:
        from dask.base import tokenize

        # augment the token with the file modification time
        mtime = os.path.getmtime(filename)
        token = tokenize(filename, mtime, chunks)
        name_prefix = 'open_rasterio-%s' % token
        if lock is None:
            lock = RASTERIO_LOCK
        result = result.chunk(chunks, name_prefix=name_prefix, token=token,
                              lock=lock)

    # Make the file closeable
    result._file_obj = riods

    return result
def read_orc(path, columns=None, storage_options=None, **kwargs):
    """Read cudf dataframe from ORC file(s).

    Note that this function is mostly borrowed from upstream Dask.

    Parameters
    ----------
    path: str or list(str)
        Location of file(s), which can be a full URL with protocol specifier,
        and may include glob character if a single string.
    columns: None or list(str)
        Columns to load. If None, loads all.
    storage_options: None or dict
        Further parameters to pass to the bytes backend.

    Returns
    -------
    cudf.DataFrame
    """

    storage_options = storage_options or {}
    fs, fs_token, paths = get_fs_token_paths(
        path, mode="rb", storage_options=storage_options
    )
    schema = None
    nstripes_per_file = []
    for path in paths:
        with fs.open(path, "rb") as f:
            o = orc.ORCFile(f)
            if schema is None:
                schema = o.schema
            elif schema != o.schema:
                raise ValueError("Incompatible schemas while parsing ORC files")
            nstripes_per_file.append(o.nstripes)
    schema = _get_pyarrow_dtypes(schema, categories=None)
    if columns is not None:
        ex = set(columns) - set(schema)
        if ex:
            raise ValueError(
                "Requested columns (%s) not in schema (%s)" % (ex, set(schema))
            )
    else:
        columns = list(schema)

    with fs.open(paths[0], "rb") as f:
        meta = cudf.read_orc(f, stripes=[0], columns=columns, **kwargs)

    name = "read-orc-" + tokenize(fs_token, path, columns, **kwargs)
    dsk = {}
    N = 0
    for path, n in zip(paths, nstripes_per_file):
        for stripe in range(n):
            dsk[(name, N)] = (
                _read_orc_stripe,
                fs,
                path,
                stripe,
                columns,
                kwargs,
            )
            N += 1

    divisions = [None] * (len(dsk) + 1)
    return dd.core.new_dd_object(dsk, name, meta, divisions)
def groupby_agg(
    ddf,
    gb_cols,
    aggs_in,
    split_every=None,
    split_out=None,
    dropna=True,
    sep="___",
    sort=False,
    as_index=True,
):
    """ Optimized groupby aggregation for Dask-CuDF.

    This aggregation algorithm only supports the following options:

    - "count"
    - "mean"
    - "std"
    - "var"
    - "sum"
    - "min"
    - "max"
    - "collect"
    - "first"
    - "last"

    This "optimized" approach is more performant than the algorithm
    in `dask.dataframe`, because it allows the cudf backend to
    perform multiple aggregations at once.
    """
    # Deal with default split_out and split_every params
    if split_every is False:
        split_every = ddf.npartitions
    split_every = split_every or 8
    split_out = split_out or 1

    # Standardize `gb_cols` and `columns` lists
    aggs = _redirect_aggs(aggs_in.copy())
    if isinstance(gb_cols, str):
        gb_cols = [gb_cols]
    columns = [c for c in ddf.columns if c not in gb_cols]
    str_cols_out = False
    if isinstance(aggs, dict):
        # Use `str_cols_out` to specify if the output columns
        # will have str (rather than MultiIndex/tuple) names.
        # This happens when all values in the `aggs` dict are
        # strings (no lists)
        str_cols_out = True
        for col in aggs:
            if isinstance(aggs[col], str) or callable(aggs[col]):
                aggs[col] = [aggs[col]]
            else:
                str_cols_out = False
            if col in gb_cols:
                columns.append(col)

    # Assert that aggregations are supported
    _supported = {
        "count",
        "mean",
        "std",
        "var",
        "sum",
        "min",
        "max",
        "collect",
        "first",
        "last",
    }
    if not _is_supported(aggs, _supported):
        raise ValueError(
            f"Supported aggs include {_supported} for groupby_agg API. "
            f"Aggregations must be specified with dict or list syntax."
        )

    # Always convert aggs to dict for consistency
    if isinstance(aggs, list):
        aggs = {col: aggs for col in columns}

    # Begin graph construction
    dsk = {}
    token = tokenize(ddf, gb_cols, aggs)
    partition_agg_name = "groupby_partition_agg-" + token
    tree_reduce_name = "groupby_tree_reduce-" + token
    gb_agg_name = "groupby_agg-" + token
    for p in range(ddf.npartitions):
        # Perform groupby aggregation on each partition.
        # Split each result into `split_out` chunks (by hashing `gb_cols`)
        dsk[(partition_agg_name, p)] = (
            _groupby_partition_agg,
            (ddf._name, p),
            gb_cols,
            aggs,
            columns,
            split_out,
            dropna,
            sort,
            sep,
        )
        # Pick out each chunk using `getitem`
        for s in range(split_out):
            dsk[(tree_reduce_name, p, s, 0)] = (
                getitem,
                (partition_agg_name, p),
                s,
            )

    # Build reduction tree
    parts = ddf.npartitions
    widths = [parts]
    while parts > 1:
        parts = math.ceil(parts / split_every)
        widths.append(parts)
    height = len(widths)
    for s in range(split_out):
        for depth in range(1, height):
            for group in range(widths[depth]):

                p_max = widths[depth - 1]
                lstart = split_every * group
                lstop = min(lstart + split_every, p_max)
                node_list = [
                    (tree_reduce_name, p, s, depth - 1)
                    for p in range(lstart, lstop)
                ]

                dsk[(tree_reduce_name, group, s, depth)] = (
                    _tree_node_agg,
                    node_list,
                    gb_cols,
                    split_out,
                    dropna,
                    sort,
                    sep,
                )

    # Final output partitions.
    _aggs = aggs.copy()
    if str_cols_out:
        # Metadata should use `str` for dict values if that is
        # what the user originally specified (column names will
        # be str, rather than tuples).
        for col in aggs:
            _aggs[col] = _aggs[col][0]
    _meta = ddf._meta.groupby(gb_cols, as_index=as_index).agg(_aggs)
    for s in range(split_out):
        dsk[(gb_agg_name, s)] = (
            _finalize_gb_agg,
            (tree_reduce_name, 0, s, height - 1),
            gb_cols,
            aggs,
            columns,
            _meta.columns,
            as_index,
            sort,
            sep,
            str_cols_out,
        )

    divisions = [None] * (split_out + 1)
    graph = HighLevelGraph.from_collections(
        gb_agg_name, dsk, dependencies=[ddf]
    )
    return new_dd_object(graph, gb_agg_name, _meta, divisions)
def test_tokenize_numpy_array_supports_uneven_sizes():
    tokenize(np.random.random(7).astype(dtype="i2"))
def test_tokenize_set():
    assert tokenize({1, 2, 'x', (1, 'x')}) == tokenize({1, 2, 'x', (1, 'x')})