def wrapper(*args, **kwargs):
    environs = [{'name': k, 'value': v} for k, v in envs.items()]
    payload = {
        'cpus': str(cpus),
        # `async` is a reserved word in Python 3, so the enclosing scope must
        # bind this flag under a different name, e.g. `async_`.
        'async': bool(async_),
        'mem': str(mem),
        'disabled': bool(disabled),
        'retries': int(retries),
        'uris': list(uris),
        'environmentVariables': environs
    }
    # A Chronos job is either time-scheduled or dependency-triggered, not both
    if schedule:
        payload['schedule'] = str(schedule)
    elif parents:
        payload['parents'] = list(parents)
    if docker:
        payload['container'] = {
            'type': 'DOCKER',
            'image': str(docker),
            'forcePullImage': bool(force_pull)
        }
    # Derive a deterministic job id from the function name and its arguments
    cid = '{}-{}'.format(name or fn.__name__, tokenize(*args, **kwargs))
    payload['name'] = cid
    payload['command'] = command(fn, args, kwargs, path=path)
    schedule_job(host=host, payload=payload)
    return cid
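# For context, a wrapper like the one above lives inside a decorator factory
# that closes over every name it uses (fn, cpus, envs, host, ...). The
# following is a minimal sketch of such a factory, not the original code;
# the parameter names and defaults are assumptions matching the wrapper body.
import functools

def chronos(host, name=None, cpus=0.1, mem=128.0, retries=2, disabled=False,
            async_=False, schedule=None, parents=None, docker=None,
            force_pull=False, envs=None, uris=None, path=None):
    # Hypothetical factory; `async_` carries the trailing underscore because
    # `async` is reserved in Python 3.
    envs = envs if envs is not None else {}
    uris = uris if uris is not None else []

    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            ...  # body as in the wrapper above
        return wrapper

    return decorator

# Illustrative usage: run `etl` daily at 03:00 UTC (Chronos uses ISO 8601
# repeating intervals for schedules):
# @chronos(host='chronos.example.com:4400',
#          schedule='R/2023-01-01T03:00:00Z/P1D', docker='lensa/dask.mesos')
# def etl():
#     ...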
def mesos(obj, name=None, pure=True, cpus=1, mem=64, disk=0,
          docker='lensa/dask.mesos', force_pull=False, envs=None, uris=None,
          **kwargs):
    # Use None sentinels to avoid mutable default arguments
    envs = envs if envs is not None else {}
    uris = uris if uris is not None else []

    # Attach the Mesos resource requirements and container settings to the
    # per-task parameters
    kwargs['resources'] = [Cpus(cpus), Mem(mem), Disk(disk)]
    kwargs['docker'] = docker
    kwargs['force_pull'] = force_pull
    kwargs['envs'] = envs
    kwargs['uris'] = uris

    if isinstance(obj, MesosDelayed):
        return obj

    task, dasks, params = to_task_dasks_params(obj)
    if not dasks:
        return MesosDelayedLeaf(obj, pure=pure, name=name, **kwargs)
    else:
        if not name:
            name = '%s-%s' % (type(obj).__name__, tokenize(task, pure=pure))
        dasks.append({name: task})
        params.append({name: kwargs})
        return MesosDelayed(name, dasks, params)
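# Illustrative usage of `mesos` above. This is a sketch: MesosDelayed, the
# resource proxies, and the exact compute entry point belong to the
# dask.mesos-style module this is excerpted from, and the compute call shown
# in the comment is an assumption.
def add(x, y):
    return x + y

delayed_add = mesos(add, cpus=2, mem=512)   # -> MesosDelayedLeaf
total = delayed_add(1, 2)                   # -> MesosDelayed; nothing runs yet
# total.compute()  # would submit the task graph to the Mesos cluster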
def __call__(self, *args, **kwargs):
    dask_key_name = kwargs.pop('dask_key_name', None)
    pure = kwargs.pop('pure', self.pure)

    if dask_key_name is None:
        name = (funcname(self._data) + '-' +
                tokenize(self._key, *args, pure=pure, **kwargs))
    else:
        name = dask_key_name

    args, dasks, params = unzip(map(to_task_dasks_params, args), 3)

    if kwargs:
        dask_kwargs, dasks2, params2 = to_task_dasks_params(kwargs)
        params = params + (params2,)
        dasks = dasks + (dasks2,)
        # Keyword arguments require the `apply` indirection in the task tuple
        task = (apply, self._data, list(args), dask_kwargs)
    else:
        task = (self._data,) + args

    dasks = flat_unique(dasks)
    dasks.append({name: task})
    params = flat_unique(params)
    params.append({name: self.params[self._key]})
    return MesosDelayed(name, dasks, params)
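# __call__ builds standard dask task tuples: (func, *args) for positional
# calls and (apply, func, [args], kwargs) when keyword arguments are present.
# A self-contained demonstration of that convention using dask's plain
# scheduler; `apply` is defined locally here rather than imported, to avoid
# relying on dask internals.
from dask.core import get

def apply(func, args, kwargs=None):
    return func(*args, **(kwargs or {}))

def scale(x, factor=1):
    return x * factor

dsk = {
    'x': 10,
    'plain': (scale, 'x'),                       # no kwargs: (func, *args)
    'kw': (apply, scale, ['x'], {'factor': 3}),  # kwargs routed via apply
}
assert get(dsk, 'plain') == 10
assert get(dsk, 'kw') == 30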
import numpy as np
import dask.array as da
from dask.array import Array
from dask.base import tokenize
from dask.core import flatten
from dask.highlevelgraph import HighLevelGraph

# `_block_fast_hist2d` is a module-local helper; see the sketch below.

def dask_hist2d(x: da.Array, y: da.Array, bins: int, range, density=False):
    if x.shape != y.shape:
        raise ValueError(
            f"Mismatch in argument shapes: x.shape == {x.shape}; y.shape == {y.shape}"
        )
    token = tokenize(x, y, bins, range, density)
    name = "histogram2d-sum-" + token

    # One partial histogram per pair of corresponding chunks
    x_keys = flatten(x.__dask_keys__())
    y_keys = flatten(y.__dask_keys__())
    dsk = {
        (name, i, 0, 0): (_block_fast_hist2d, xi, yi, bins, range)
        for i, (xi, yi) in enumerate(zip(x_keys, y_keys))
    }
    dtype = np.histogram2d([], [])[0].dtype
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=(x, y))

    # turn graph into a 3D array of shape (nchunks, nbins, nbins)
    nchunks = len(list(flatten(x.__dask_keys__())))
    chunks = ((1,) * nchunks, (bins,), (bins,))
    mapped = Array(graph, name, chunks, dtype=dtype)

    # Sum the per-chunk histograms into the final counts
    n = mapped.sum(axis=0)
    return n
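# `_block_fast_hist2d` is referenced above but not shown. A plausible
# per-chunk implementation, assuming the fast_histogram package (whose
# histogram2d takes an integer bin count and a pair of (min, max) extents),
# could look like this; it is a sketch, not the original helper.
from fast_histogram import histogram2d as _fast_histogram2d

def _block_fast_hist2d(x, y, bins, range):
    # Histogram one pair of chunks; the leading length-1 axis lets the
    # per-chunk results stack along axis 0, matching dask_hist2d's chunks.
    return _fast_histogram2d(
        x.ravel(), y.ravel(), bins=bins, range=range
    ).reshape(1, bins, bins)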
def wrapper(*args, **kwargs):
    payload = {
        'cpus': float(cpus),
        'mem': float(mem),
        'instances': int(instances),
        'env': dict(envs),
        'uris': list(uris)
    }
    if docker:
        payload['container'] = {
            'docker': {
                'image': str(docker),
                'forcePullImage': bool(force_pull)
            }
        }
    if volumes:
        # setdefault guards against a KeyError when volumes are given
        # without a docker image
        payload.setdefault('container', {})['volumes'] = _parse_volumes(volumes)
    # Derive a deterministic app id from the function name and its arguments
    mid = '{}-{}'.format(name or fn.__name__, tokenize(*args, **kwargs))
    payload['id'] = mid
    payload['cmd'] = command(fn, args, kwargs, path=path)
    start(host=host, payload=payload)
    return mid
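# For reference, the Marathon app definition built above would look roughly
# like this for a dockerized function. All values are illustrative; the id
# suffix is the tokenize() hash of the call arguments and the cmd is the
# serialized invocation produced by the assumed command() helper.
example_payload = {
    'cpus': 1.0,
    'mem': 512.0,
    'instances': 2,
    'env': {'STAGE': 'prod'},
    'uris': [],
    'container': {
        'docker': {'image': 'lensa/dask.mesos', 'forcePullImage': False}
    },
    'id': 'score-<tokenize-hash>',
    'cmd': '<serialized call to fn>',
}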
import dask
from dask.base import tokenize
from dask.delayed import delayed

# `_to_sql_chunk` (which writes one block via pd.DataFrame.to_sql) and
# `_extra_deps` are module-local helpers; a sketch of the latter follows
# this function.

def to_sql(
    df,
    name: str,
    uri: str,
    schema=None,
    if_exists: str = "fail",
    index: bool = True,
    index_label=None,
    chunksize=None,
    dtype=None,
    method=None,
    compute=True,
    parallel=False,
    engine_kwargs=None,
):
    """Store Dask Dataframe to a SQL table

    An empty table is created based on the "meta" DataFrame (and conforming
    to the caller's "if_exists" preference), and then each block calls
    pd.DataFrame.to_sql (with `if_exists="append"`).

    Databases supported by SQLAlchemy [1]_ are supported. Tables can be
    newly created, appended to, or overwritten.

    Parameters
    ----------
    name : str
        Name of SQL table.
    uri : string
        Full sqlalchemy URI for the database connection
    schema : str, optional
        Specify the schema (if database flavor supports this). If None, use
        default schema.
    if_exists : {'fail', 'replace', 'append'}, default 'fail'
        How to behave if the table already exists.

        * fail: Raise a ValueError.
        * replace: Drop the table before inserting new values.
        * append: Insert new values to the existing table.

    index : bool, default True
        Write DataFrame index as a column. Uses `index_label` as the column
        name in the table.
    index_label : str or sequence, default None
        Column label for index column(s). If None is given (default) and
        `index` is True, then the index names are used. A sequence should be
        given if the DataFrame uses MultiIndex.
    chunksize : int, optional
        Specify the number of rows in each batch to be written at a time.
        By default, all rows will be written at once.
    dtype : dict or scalar, optional
        Specifying the datatype for columns. If a dictionary is used, the
        keys should be the column names and the values should be the
        SQLAlchemy types or strings for the sqlite3 legacy mode. If a
        scalar is provided, it will be applied to all columns.
    method : {None, 'multi', callable}, optional
        Controls the SQL insertion clause used:

        * None : Uses standard SQL ``INSERT`` clause (one per row).
        * 'multi': Pass multiple values in a single ``INSERT`` clause.
        * callable with signature ``(pd_table, conn, keys, data_iter)``.

        Details and a sample callable implementation can be found in the
        section :ref:`insert method <io.sql.method>`.
    compute : bool, default True
        When true, call dask.compute and perform the load into SQL;
        otherwise, return a Dask object (or array of per-block objects when
        parallel=True)
    parallel : bool, default False
        When true, have each block append itself to the DB table
        concurrently. This can result in DB rows being in a different order
        than the source DataFrame's corresponding rows. When false, load
        each block into the SQL DB in sequence.
    engine_kwargs : dict or None
        Specific db engine parameters for sqlalchemy

    Raises
    ------
    ValueError
        When the table already exists and `if_exists` is 'fail' (the
        default).

    See Also
    --------
    read_sql : Read a DataFrame from a table.

    Notes
    -----
    Timezone aware datetime columns will be written as
    ``Timestamp with timezone`` type with SQLAlchemy if supported by the
    database. Otherwise, the datetimes will be stored as timezone unaware
    timestamps local to the original timezone.

    .. versionadded:: 0.24.0

    References
    ----------
    .. [1] https://docs.sqlalchemy.org
    .. [2] https://www.python.org/dev/peps/pep-0249/

    Examples
    --------
    Create a table from scratch with 4 rows.

    >>> import pandas as pd
    >>> df = pd.DataFrame([ {'i':i, 's':str(i)*2 } for i in range(4) ])
    >>> from dask.dataframe import from_pandas
    >>> ddf = from_pandas(df, npartitions=2)
    >>> ddf  # doctest: +SKIP
    Dask DataFrame Structure:
                       i       s
    npartitions=2
    0              int64  object
    2                ...     ...
    3                ...     ...
    Dask Name: from_pandas, 2 tasks

    >>> from dask.utils import tmpfile
    >>> from sqlalchemy import create_engine
    >>> with tmpfile() as f:
    ...     db = 'sqlite:///%s' % f
    ...     ddf.to_sql('test', db)
    ...     engine = create_engine(db, echo=False)
    ...     result = engine.execute("SELECT * FROM test").fetchall()
    >>> result
    [(0, 0, '00'), (1, 1, '11'), (2, 2, '22'), (3, 3, '33')]
    """
    if not isinstance(uri, str):
        raise ValueError(f"Expected URI to be a string, got {type(uri)}.")

    # This is the only argument we add on top of what Pandas supports
    kwargs = dict(
        name=name,
        uri=uri,
        engine_kwargs=engine_kwargs,
        schema=schema,
        if_exists=if_exists,
        index=index,
        index_label=index_label,
        chunksize=chunksize,
        dtype=dtype,
        method=method,
    )

    meta_task = delayed(_to_sql_chunk)(df._meta, **kwargs)

    # Partitions should always append to the empty table created from `meta` above
    worker_kwargs = dict(kwargs, if_exists="append")

    if parallel:
        # Perform the meta insert, then one task that inserts all blocks concurrently:
        result = [
            _extra_deps(
                _to_sql_chunk,
                d,
                extras=meta_task,
                **worker_kwargs,
                dask_key_name="to_sql-%s" % tokenize(d, **worker_kwargs),
            )
            for d in df.to_delayed()
        ]
    else:
        # Chain the "meta" insert and each block's insert
        result = []
        last = meta_task
        for d in df.to_delayed():
            result.append(
                _extra_deps(
                    _to_sql_chunk,
                    d,
                    extras=last,
                    **worker_kwargs,
                    dask_key_name="to_sql-%s" % tokenize(d, **worker_kwargs),
                )
            )
            last = result[-1]
    result = dask.delayed(result)

    if compute:
        dask.compute(result)
    else:
        return result
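# `_extra_deps` is not shown above. The pattern it implements is a delayed
# pass-through whose unused `extras` argument exists purely to inject a
# dependency edge; that is how the sequential branch chains one insert after
# another, and how both branches wait on the meta-table creation. A minimal
# sketch consistent with the calls above (not necessarily the original):
from dask.delayed import delayed

@delayed
def _extra_deps(func, *args, extras=None, **kwargs):
    # `extras` is never touched: passing a Delayed through it makes this
    # task wait on it without feeding its result to `func`.
    return func(*args, **kwargs)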
import numpy as np
from dask.array import Array, asarray
from dask.base import is_dask_collection, tokenize
from dask.core import flatten
from dask.delayed import Delayed, unpack_collections
from dask.highlevelgraph import HighLevelGraph

# `_linspace_from_delayed` (dask-internal) and `_block_fast_hist1d` are
# module-local helpers; a sketch of the latter follows this function.

def dask_hist1d(
    a: Array, bins=None, range=None, normed=False, weights=None, density=None
):
    """
    Blocked variant of :func:`numpy.histogram`, but using the fast-histogram
    module.

    Parameters
    ----------
    a : array_like
        Input data. The histogram is computed over the flattened array.
    bins : int or sequence of scalars, optional
        Either an iterable specifying the ``bins`` or the number of ``bins``
        and a ``range`` argument is required as computing ``min`` and ``max``
        over blocked arrays is an expensive operation that must be performed
        explicitly.
        If `bins` is an int, it defines the number of equal-width bins in
        the given range (10, by default). If `bins` is a sequence, it
        defines a monotonically increasing array of bin edges, including the
        rightmost edge, allowing for non-uniform bin widths.
    range : (float, float), optional
        The lower and upper range of the bins. If not provided, range is
        simply ``(a.min(), a.max())``. Values outside the range are ignored.
        The first element of the range must be less than or equal to the
        second. `range` affects the automatic bin computation as well. While
        bin width is computed to be optimal based on the actual data within
        `range`, the bin count will fill the entire range including portions
        containing no data.
    normed : bool, optional
        This is equivalent to the ``density`` argument, but produces
        incorrect results for unequal bin widths. It should not be used.
    weights : array_like, optional
        A dask.array.Array of weights, of the same block structure as ``a``.
        Each value in ``a`` only contributes its associated weight towards
        the bin count (instead of 1). If ``density`` is True, the weights
        are normalized, so that the integral of the density over the range
        remains 1.
    density : bool, optional
        If ``False``, the result will contain the number of samples in each
        bin. If ``True``, the result is the value of the probability
        *density* function at the bin, normalized such that the *integral*
        over the range is 1. Note that the sum of the histogram values will
        not be equal to 1 unless bins of unity width are chosen; it is not a
        probability *mass* function.
        Overrides the ``normed`` keyword if given.
        If ``density`` is True, ``bins`` cannot be a single-number delayed
        value. It must be a concrete number, or a (possibly-delayed)
        array/sequence of the bin edges.

    Returns
    -------
    hist : dask Array
        The values of the histogram. See `density` and `weights` for a
        description of the possible semantics.
    bin_edges : dask Array of dtype float
        Return the bin edges ``(length(hist)+1)``.

    Examples
    --------
    Using number of bins and range:

    >>> import dask.array as da
    >>> import numpy as np
    >>> x = da.from_array(np.arange(10000), chunks=10)
    >>> h, bins = da.histogram(x, bins=10, range=[0, 10000])
    >>> bins
    array([    0.,  1000.,  2000.,  3000.,  4000.,  5000.,  6000.,  7000.,
            8000.,  9000., 10000.])
    >>> h.compute()
    array([1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000])

    Explicitly specifying the bins:

    >>> h, bins = da.histogram(x, bins=np.array([0, 5000, 10000]))
    >>> bins
    array([    0,  5000, 10000])
    >>> h.compute()
    array([5000, 5000])
    """
    if isinstance(bins, Array):
        scalar_bins = bins.ndim == 0
        # ^ `np.ndim` is not implemented by Dask array.
    elif isinstance(bins, Delayed):
        scalar_bins = bins._length is None or bins._length == 1
    else:
        scalar_bins = np.ndim(bins) == 0

    if bins is None or (scalar_bins and range is None):
        raise ValueError(
            "dask.array.histogram requires either specifying "
            "bins as an iterable or specifying both a range and "
            "the number of bins"
        )

    if weights is not None and weights.chunks != a.chunks:
        raise ValueError("Input array and weights must have the same chunked structure")

    if normed is not False:
        raise ValueError(
            "The normed= keyword argument has been deprecated. "
            "Please use density instead. "
            "See the numpy.histogram docstring for more information."
        )

    if density and scalar_bins and isinstance(bins, (Array, Delayed)):
        raise NotImplementedError(
            "When `density` is True, `bins` cannot be a scalar Dask object. "
            "It must be a concrete number or a (possibly-delayed) array/sequence of bin edges."
        )

    for argname, val in [("bins", bins), ("range", range), ("weights", weights)]:
        if not isinstance(val, (Array, Delayed)) and is_dask_collection(val):
            raise TypeError(
                "Dask types besides Array and Delayed are not supported "
                "for `histogram`. For argument `{}`, got: {!r}".format(argname, val)
            )

    if range is not None:
        try:
            if len(range) != 2:
                raise ValueError(
                    f"range must be a sequence or array of length 2, but got {len(range)} items"
                )
            if isinstance(range, (Array, np.ndarray)) and range.shape != (2,):
                raise ValueError(
                    f"range must be a 1-dimensional array of two items, but got an array of shape {range.shape}"
                )
        except TypeError:
            raise TypeError(
                f"Expected a sequence or array for range, not {range}"
            ) from None

    token = tokenize(a, bins, range, weights, density)
    name = "histogram-sum-" + token

    if scalar_bins:
        bins = _linspace_from_delayed(range[0], range[1], bins + 1)
        # ^ NOTE `range[1]` is safe because of the above check, and the
        # initial check that range must not be None if `scalar_bins`
    else:
        if not isinstance(bins, (Array, np.ndarray)):
            bins = asarray(bins)
        if bins.ndim != 1:
            raise ValueError(
                f"bins must be a 1-dimensional array or sequence, got shape {bins.shape}"
            )

    (bins_ref, range_ref), deps = unpack_collections([bins, range])

    # Map the histogram to all bins, forming a 2D array of histograms,
    # stacked for each chunk
    if weights is None:
        dsk = {
            (name, i, 0): (_block_fast_hist1d, k, bins_ref, range_ref)
            for i, k in enumerate(flatten(a.__dask_keys__()))
        }
        dtype = np.histogram([])[0].dtype
    else:
        a_keys = flatten(a.__dask_keys__())
        w_keys = flatten(weights.__dask_keys__())
        dsk = {
            (name, i, 0): (_block_fast_hist1d, k, bins_ref, range_ref, w)
            for i, (k, w) in enumerate(zip(a_keys, w_keys))
        }
        dtype = weights.dtype

    deps = (a,) + deps
    if weights is not None:
        deps += (weights,)
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=deps)

    # Turn graph into a 2D Array of shape (nchunks, nbins)
    nchunks = len(list(flatten(a.__dask_keys__())))
    nbins = bins.size - 1  # since `bins` is 1D
    chunks = ((1,) * nchunks, (nbins,))
    mapped = Array(graph, name, chunks, dtype=dtype)

    # Sum over chunks to get the final histogram
    n = mapped.sum(axis=0)

    # We need to replicate normed and density options from numpy
    if density is not None:
        if density:
            db = asarray(np.diff(bins).astype(float), chunks=n.chunks)
            return n / db / n.sum(), bins
        else:
            return n, bins
    else:
        return n, bins
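# `_block_fast_hist1d` is likewise only referenced above. Below is a sketch
# consistent with how it is called (edges array, optional range, optional
# weights), assuming the fast_histogram package, whose histogram1d only
# supports uniform bins given as a count plus a (min, max) extent.
from fast_histogram import histogram1d as _fast_histogram1d

def _block_fast_hist1d(a, bins, range=None, weights=None):
    # `bins` arrives as an array of uniform edges; recover count and extent
    # from it. `range` is accepted for signature compatibility with the task
    # tuples above but the extent is taken from the edges themselves.
    edges = np.asarray(bins)
    counts = _fast_histogram1d(
        np.asarray(a).ravel(),
        bins=len(edges) - 1,
        range=(edges[0], edges[-1]),
        weights=None if weights is None else np.asarray(weights).ravel(),
    )
    # Leading length-1 axis so per-chunk results stack along axis 0,
    # matching the ((1,) * nchunks, (nbins,)) chunk layout in dask_hist1d.
    return counts.reshape(1, -1)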