def compute_stepsize_dask(beta, step, Xbeta, Xstep, y, curr_val,
                          family=Logistic, stepSize=1.0,
                          armijoMult=0.1, backtrackMult=0.1):
    loglike = family.loglike
    beta, step, Xbeta, Xstep, y, curr_val = persist(
        beta, step, Xbeta, Xstep, y, curr_val)
    obeta, oXbeta = beta, Xbeta
    (step,) = compute(step)
    steplen = (step ** 2).sum()
    lf = curr_val
    func = 0
    for ii in range(100):
        beta = obeta - stepSize * step
        if ii and (beta == obeta).all():
            stepSize = 0
            break
        Xbeta = oXbeta - stepSize * Xstep

        func = loglike(Xbeta, y)
        Xbeta, func = persist(Xbeta, func)

        df = lf - compute(func)[0]
        if df >= armijoMult * stepSize * steplen:
            break
        stepSize *= backtrackMult

    return stepSize, beta, Xbeta, func
def scatter_data_to_workers(self):
    self.scatteredDataFutures = None
    if self.client is not None:
        if not self.cpuFlag:
            print('scattering data to GPU workers...', end='')
            self.scatteredDataFutures = self.client.scatter([
                self.dataset.trainData,
                self.dataset.trainLabels,
                self.dataset.testData,
                self.dataset.testLabels
            ], broadcast=True)
        else:
            print('scattering data to CPU workers...', end='')
            self.scatteredDataFutures = self.client.scatter(
                [
                    self.dataset.cpuDataset['trainData'],
                    self.dataset.cpuDataset['trainLabels'],
                    self.dataset.cpuDataset['testData'],
                    self.dataset.cpuDataset['testLabels']
                ],
                broadcast=False
            )  # there is no need to broadcast between CPU workers [ ? ]

        print('done scatter')
        print(' + persisting scattered data...', end='')
        persist(self.scatteredDataFutures)
        print('done persist')
    else:
        raise RuntimeError('error: missing a dask client/cluster!')
def compute_stepsize_dask(beta, step, Xbeta, Xstep, y, curr_val,
                          family=Logistic, stepSize=1.0,
                          armijoMult=0.1, backtrackMult=0.1):
    """Compute a step size via Armijo backtracking line search

    Parameters
    ----------
    beta : array-like
        Current parameter estimate
    step : array-like
        Search direction (e.g. the gradient at ``beta``)
    Xbeta : array-like
        ``X.dot(beta)``
    Xstep : array-like
        ``X.dot(step)``
    y : array-like
    curr_val : float
        Objective value at the current ``beta``
    family : Family, optional
    stepSize : float, optional
    armijoMult : float, optional
    backtrackMult : float, optional

    Returns
    -------
    stepSize : float
    beta : array-like
    Xbeta : array-like
    func : float
        Log-likelihood value at the returned ``beta``
    """
    loglike = family.loglike
    beta, step, Xbeta, Xstep, y, curr_val = persist(
        beta, step, Xbeta, Xstep, y, curr_val)
    obeta, oXbeta = beta, Xbeta
    (step,) = compute(step)
    steplen = (step ** 2).sum()
    lf = curr_val
    func = 0
    for ii in range(100):
        beta = obeta - stepSize * step
        if ii and (beta == obeta).all():
            stepSize = 0
            break
        Xbeta = oXbeta - stepSize * Xstep

        func = loglike(Xbeta, y)
        Xbeta, func = persist(Xbeta, func)

        df = lf - compute(func)[0]
        if df >= armijoMult * stepSize * steplen:
            break
        stepSize *= backtrackMult

    return stepSize, beta, Xbeta, func
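# A plain-NumPy sketch of the same Armijo backtracking rule used above, shown
# only to make the loop's exit condition explicit.  ``f`` is any smooth loss
# and ``grad`` its gradient at ``beta``; the names here are illustrative and
# not part of the original code.
import numpy as np

def backtrack(f, beta, grad, step_size=1.0, armijo_mult=0.1,
              backtrack_mult=0.1, max_iter=100):
    f0 = f(beta)
    steplen = (grad ** 2).sum()
    candidate = beta
    for _ in range(max_iter):
        candidate = beta - step_size * grad
        # Armijo condition: accept the step once the decrease in f is at
        # least armijo_mult * step_size * ||grad||^2.
        if f0 - f(candidate) >= armijo_mult * step_size * steplen:
            break
        step_size *= backtrack_mult
    return step_size, candidate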
async def test_persist(c, s, a, b):
    x = delayed(inc)(1)
    (x2,) = persist(x)

    await wait(x2)
    assert x2.key in a.data or x2.key in b.data

    y = delayed(inc)(10)
    y2, one = persist(y, 1)

    await wait(y2)
    assert y2.key in a.data or y2.key in b.data
def test_persist(c, s, a, b):
    x = delayed(inc)(1)
    x2, = persist(x)

    yield wait(x2)
    assert x2.key in a.data or x2.key in b.data

    y = delayed(inc)(10)
    y2, one = persist(y, 1)

    yield wait(y2)
    assert y2.key in a.data or y2.key in b.data
def test_persist(c, s, a, b):
    x = delayed(inc)(1)
    x2, = persist(x)

    yield _wait(x2)
    assert x2.key in a.data or x2.key in b.data

    y = delayed(inc)(10)
    y2, one = persist(y, 1)

    yield _wait(y2)
    assert y2.key in a.data or y2.key in b.data
def __init__(self, xyz, topology, time=None, delayed_objects=None, **kwargs):
    dask.persist(**kwargs)
    self._unitcell_vectors = None
    super(Trajectory, self).__init__(xyz=xyz, topology=topology, time=time,
                                     **kwargs)
def test_repeated_persists_same_priority(c, s, w):
    xs = [delayed(slowinc)(i, delay=0.05, dask_key_name='x-%d' % i)
          for i in range(10)]
    ys = [delayed(slowinc)(x, delay=0.05, dask_key_name='y-%d' % i)
          for i, x in enumerate(xs)]
    zs = [delayed(slowdec)(x, delay=0.05, dask_key_name='z-%d' % i)
          for i, x in enumerate(xs)]

    ys = dask.persist(*ys)
    zs = dask.persist(*zs)

    while sum(t.state == 'memory' for t in s.tasks.values()) < 5:  # TODO: reduce this number
        yield gen.sleep(0.01)

    assert any(s.tasks[y.key].state == 'memory' for y in ys)
    assert any(s.tasks[z.key].state == 'memory' for z in zs)
def apply_func(self, func, varname, *args, **kwargs):
    """
    Apply the function to each block of data (doesn't use xarray)

    See here:
        http://dask.pydata.org/en/latest/delayed-best-practices.html
    """
    @delayed
    def load_single_nc(ncfile, varname):
        with Dataset(ncfile) as nc:
            # Load the data
            X = nc.variables[varname][:]
            X[np.isnan(X)] = 0.
        return X

    @delayed
    def lazy_func(func, X, *args, **kwargs):
        return func(X, *args, **kwargs)

    def f(func, ncfiles, varname, *args, **kwargs):
        output = []
        for ncfile in ncfiles:
            X = load_single_nc(ncfile, varname)
            output.append(lazy_func(func, X, *args, **kwargs))
        return output

    stack = dask.persist(f(func, self.filenames, varname, *args, **kwargs))

    return np.concatenate([ii.compute() for ii in stack[0]],
                          axis=-1)[..., self.ghost]
def main(args=None):
    args = parse_args(args)
    steps = range(args.start, args.stop, args.step)

    if args.scheduler_address:
        client = Client(args.scheduler_address)
        info = client.scheduler_info()
        logger.info("Distributed mode: %s", client.scheduler)
        logger.info("Dashboard: %s:%s", info['address'],
                    info['services']['bokeh'])
    else:
        logger.warning("Local mode")

    logger.info("Fitting for %s", list(steps))

    logger.info("Reading data")
    X = read().pipe(transform).pipe(as_array)
    X, = persist(X)

    timings = []
    for n_clusters in range(args.start, args.stop, args.step):
        logger.info("Starting %02d", n_clusters)
        t0 = tic()
        km = do(X, n_clusters, factor=args.factor)
        t1 = tic()
        logger.info("Finished %02d, [%.2f]", n_clusters, t1 - t0)
        logger.info("Cluster Centers [%s]:\n%s", n_clusters,
                    km.cluster_centers_)
        inertia = km.inertia_.compute()
        logger.info("Inertia [%s]: %s", km.cluster_centers_, inertia)
        timings.append((n_clusters, args.factor, t1 - t0, inertia))

    pd.DataFrame(timings,
                 columns=['n_clusters', 'factor', 'time',
                          'inertia']).to_csv('timings.csv')
def cleaning(self):
    cols = [
        'Year', 'Month', 'DayOfWeek', 'Distance', 'DepDelay', 'CRSDepTime',
        'UniqueCarrier', 'Origin', 'Dest'
    ]

    # Create the dataframe
    df = dd.read_csv(sorted(glob(os.path.join('data', 'nycflights', '*.csv'))),
                     usecols=cols,
                     storage_options={'anon': True})

    df = df.sample(frac=0.2)  # we blow out ram otherwise

    label = (df.DepDelay.fillna(16) > 15)

    df['CRSDepTime'] = df['CRSDepTime'].clip(upper=2399)
    del df['DepDelay']

    df, label = persist(df, label)

    df2 = dd.get_dummies(df.categorize()).persist()

    X_train, X_test = df2.random_split([0.9, 0.1], random_state=1234)
    y_train, y_test = label.random_split([0.9, 0.1], random_state=1234)

    return X_train, X_test, y_train, y_test
def fit(self, X, y=None):
    self._reset()
    to_persist = OrderedDict()
    feature_range = self.feature_range

    if feature_range[0] >= feature_range[1]:
        raise ValueError("Minimum of desired feature "
                         "range must be smaller than maximum.")

    _X = slice_columns(X, self.columns)
    data_min = _X.min(0)
    data_max = _X.max(0)
    data_range = data_max - data_min
    scale = ((feature_range[1] - feature_range[0]) /
             handle_zeros_in_scale(data_range))

    to_persist["data_min_"] = data_min
    to_persist["data_max_"] = data_max
    to_persist["data_range_"] = data_range
    to_persist["scale_"] = scale
    to_persist["min_"] = feature_range[0] - data_min * scale
    to_persist["n_samples_seen_"] = np.nan

    values = persist(*to_persist.values())
    for k, v in zip(to_persist, values):
        setattr(self, k, v)

    return self
def _to_ds24(self, X, y=None, *, batch_size, shuffle, drop_remainder):
    def to_spec(name, dtype, idx):
        if dtype is not None:
            spec = tf.TensorSpec(shape=(None, len(idx)), dtype=dtype)
        else:
            # var len
            v = X[name].head(1).tolist()[0]
            spec = tf.TensorSpec(shape=(None, len(v)), dtype='int32')
        return spec

    meta = self._get_meta(X)
    sig = {k: to_spec(k, dtype, idx) for k, (dtype, idx) in meta.items()}

    if y is not None:
        if isinstance(y, dd.Series):
            y = y.to_dask_array(lengths=True)
        if self.task == consts.TASK_MULTICLASS:
            y = self._to_categorical(y, num_classes=self.num_classes)
            sig = sig, tf.TensorSpec(shape=(None, self.num_classes), dtype=y.dtype)
        else:
            sig = sig, tf.TensorSpec(shape=(None,), dtype=y.dtype)

    X = X.to_dask_array(lengths=True)
    X, y = dask.persist(X, y)

    gen = partial(self._generate, meta, X, y,
                  batch_size=batch_size, shuffle=shuffle,
                  drop_remainder=drop_remainder)
    ds = tf.data.Dataset.from_generator(gen, output_signature=sig)
    return ds
def main(args=None):
    args = parse_args(args)
    client = Client(args.scheduler_address)  # noqa

    if args.scheduler_address.startswith("ucx://"):
        setup()
        client.run_on_scheduler(setup)
        client.run(setup)

    n_keys = args.n_keys
    n_rows_l = args.left_rows
    n_rows_r = args.left_rows

    gleft, gright = make_data(n_keys, n_rows_l, n_rows_r)

    t0 = clock()
    gleft, gright = dask.persist(gleft, gright)
    wait([gleft, gright])

    print('left :', gleft)
    print('right :', gright)

    t1 = clock()
    print("Persist :", t1 - t0)

    out = gleft.merge(gright, on=['id'])
    t2 = clock()
    result = out.compute()
    t3 = clock()

    print("Schedule:", t2 - t1)
    print("Compute :", t3 - t2)
    print("Total   :", t3 - t0)
    print(type(result))
    print(result.head())
def cg_project(A, x, y, tol=1e-8, **options):
    r"""Project (x, y) onto graph G = {(y, x) | y = Ax} via CG

    In particular, form outputs as:

        :math:`x_{out} = (1 + A^TA)^{-1}(A^Ty + x)`
        :math:`y_{out} = Ax_{out}`
    """
    fmt = 'array {} compatible'
    assert A.shape[0] == y.shape[0] and A.shape[1] == x.shape[0], fmt.format(
        'dims')
    assert A.chunks[0] == y.chunks[0] and A.chunks[1] == x.chunks[
        0], fmt.format('chunks')

    token = options.pop(
        'name', 'cg-project-' + dask.base.tokenize(A, x, y, tol, **options))
    nm_b, nm_x, nm_y = map(lambda nm: nm + '-' + token, ('b', 'x', 'y'))

    # b = A'y + x
    b = atoms2.gemv(1, A, y, 1, x, transpose=True, name=nm_b)

    A_hat = linop.DLORegularizedGram(A, transpose=False)
    x_out, res, iters = cg.cg_graph(A_hat, b, tol=tol, name=nm_x, **options)
    y_out = atoms2.dot(A, x_out, name=nm_y)

    x_out, y_out = dask.persist(x_out, y_out)
    return x_out, y_out, res, iters
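# A dense NumPy sketch of the projection that cg_project's docstring describes,
# x_out = (I + A^T A)^{-1} (A^T y + x) and y_out = A x_out, included only to
# spell out the math.  The function above computes the same quantity
# out-of-core with a CG solve instead of forming the inverse explicitly.
import numpy as np

def dense_graph_projection(A, x, y):
    n = A.shape[1]
    x_out = np.linalg.solve(np.eye(n) + A.T @ A, A.T @ y + x)
    y_out = A @ x_out
    return x_out, y_out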
def _fit_parallel(
    self,
    convert_to_inference: bool = False,
    sampler_args: dict = None,
) -> Union[List[CmdStanMCMC], List[az.InferenceData]]:
    """Fit model by parallelizing across features.

    :param convert_to_inference: Whether to create individual InferenceData
        objects for individual feature fits, defaults to False
    :type convert_to_inference: bool

    :param sampler_args: Additional parameters to pass to CmdStanPy sampler
        (optional)
    :type sampler_args: dict
    """
    if sampler_args is None:
        sampler_args = dict()

    _fits = []
    for v, i, d in self.table.iter(axis="observation"):
        _fit = dask.delayed(self._fit_single)(
            v,
            sampler_args,
            convert_to_inference,
        )
        _fits.append(_fit)

    fit_futures = dask.persist(*_fits)
    all_fits = dask.compute(fit_futures)[0]
    # Set data back to full table
    self.dat["y"] = self.table.matrix_data.todense().T.astype(int)
    self.fit = all_fits
def missing_impact(df: dd.DataFrame, bins: int) -> Intermediate:
    """
    Calculate the data for visualizing the plot_missing(df).
    This contains the missing spectrum, missing bar chart and missing heatmap.
    """
    cols = df.columns.values
    (nulldf,) = dask.persist(df.isnull())
    nullity = nulldf.to_dask_array(lengths=True)

    null_perc = nullity.sum(axis=0) / nullity.shape[0]

    tasks = (
        missing_spectrum(nullity, cols, bins=bins),
        null_perc,
        missing_bars(null_perc, cols),
        missing_heatmap(nulldf, null_perc, cols),
        missing_dendrogram(nullity, cols),
    )

    spectrum, null_perc, bars, heatmap, dendrogram = dd.compute(*tasks)

    return Intermediate(
        data_total_missing={
            col: null_perc[idx] for idx, col in enumerate(cols)
        },
        data_spectrum=spectrum,
        data_bars=bars,
        data_heatmap=heatmap,
        data_dendrogram=dendrogram,
        visual_type="missing_impact",
    )
def prepare_data():
    # Choose columns to use
    cols = [
        'Year', 'Month', 'DayOfWeek', 'Distance', 'DepDelay', 'CRSDepTime',
        'UniqueCarrier', 'Origin', 'Dest'
    ]

    df = dd.read_csv(os.path.join('data', 'nycflights', '*.csv'),
                     usecols=cols,
                     storage_options={'anon': True})

    is_delayed = (df.DepDelay.fillna(16) > 15)

    # Remove delay information from training dataframe
    del df['DepDelay']

    # Trim all the values in data
    df['CRSDepTime'] = df['CRSDepTime'].clip(upper=2399)

    # df: data from which we will learn if flights are delayed
    # is_delayed: whether or not those flights were delayed
    df, is_delayed = dask.persist(df, is_delayed)

    # Convert categorical data into numerical
    df_numerical = dd.get_dummies(df.categorize()).persist()

    print("- Done")
    return df_numerical, is_delayed
def run(self):
    self._validate_setup()

    write_locks = {}
    for times in self._times:
        filename = self._get_output_filename(times)
        self.setup_netcdf_output(filename, times)
        write_locks[filename] = combine_locks(
            [NETCDFC_LOCK, get_write_lock(filename)])
    self.logger.info('Starting {} chunks...'.format(len(self.slices)))

    delayed_objs = [wrap_run_slice(self.params, write_locks, dslice)
                    for dslice in self.slices]
    persisted = dask.persist(delayed_objs,
                             num_workers=self.params['num_workers'])
    self.progress_bar(persisted)
    dask.compute(persisted)

    self.logger.info('Cleaning up...')
    try:
        self._client.cluster.close()
        self._client.close()
        if self.params['verbose'] == logging.DEBUG:
            print()
            print('closed dask cluster/client')
    except Exception:
        pass
def main(args=None):
    args = parse_args(args)
    steps = range(args.start, args.stop, args.step)

    if args.scheduler_address:
        client = Client(args.scheduler_address)
        info = client.scheduler_info()
        logger.info("Distributed mode: %s", client.scheduler)
        logger.info("Dashboard: %s:%s", info["address"],
                    info["services"]["bokeh"])
    else:
        logger.warning("Local mode")

    logger.info("Fitting for %s", list(steps))

    logger.info("Reading data")
    X = read().pipe(transform).pipe(as_array)
    X, = persist(X)

    timings = []
    for n_clusters in range(args.start, args.stop, args.step):
        logger.info("Starting %02d", n_clusters)
        t0 = tic()
        with _timer(n_clusters, _logger=logger):
            km = do(X, n_clusters, factor=args.factor)
        t1 = tic()
        logger.info("Cluster Centers [%s]:\n%s", n_clusters,
                    km.cluster_centers_)
        inertia = km.inertia_.compute()
        logger.info("Inertia [%s]: %s", km.cluster_centers_, inertia)
        timings.append((n_clusters, args.factor, t1 - t0, inertia))

    pd.DataFrame(timings,
                 columns=["n_clusters", "factor", "time",
                          "inertia"]).to_csv("timings.csv")
def _evaluate_datasets(pipelines, datasets, hyperparameters, metrics,
                       distributed, test_split, detrend):
    delayed = []
    for dataset, signals in datasets.items():
        LOGGER.info("Starting dataset {} with {} signals..".format(
            dataset, len(signals)))

        # dataset configuration
        hyperparameters_ = _get_parameter(hyperparameters, dataset)
        parameters = _get_parameter(BENCHMARK_PARAMS, dataset)
        if parameters is not None:
            detrend, test_split = parameters.values()

        result = _evaluate_pipelines(pipelines, dataset, signals,
                                     hyperparameters_, metrics, distributed,
                                     test_split, detrend)
        delayed.extend(result)

    if distributed:
        persisted = dask.persist(*delayed)
        results = dask.compute(*persisted)
    else:
        results = delayed

    df = pd.DataFrame.from_records(results)
    return df
def from_dask(df: "dask.DataFrame") -> Dataset[ArrowRow]:
    """Create a dataset from a Dask DataFrame.

    Args:
        df: A Dask DataFrame.

    Returns:
        Dataset holding Arrow records read from the DataFrame.
    """
    import dask
    from ray.util.dask import ray_dask_get

    partitions = df.to_delayed()
    persisted_partitions = dask.persist(*partitions, scheduler=ray_dask_get)

    import pandas

    def to_ref(df):
        if isinstance(df, pandas.DataFrame):
            return ray.put(df)
        elif isinstance(df, ray.ObjectRef):
            return df
        else:
            raise ValueError(
                "Expected a Ray object ref or a Pandas DataFrame, "
                f"got {type(df)}")

    return from_pandas_refs([
        to_ref(next(iter(part.dask.values())))
        for part in persisted_partitions
    ])
def _transform(self, ds, do_fit=False, method_name=None):
    for i, block in enumerate(self.graph):

        if block.dataset_map is not None:
            try:
                ds = block.dataset_map(ds)
            except Exception as e:
                raise RuntimeError(
                    f"Could not map ds {ds}\n with {block.dataset_map}"
                ) from e
            continue

        if do_fit:
            args = _get_dask_args_from_ds(ds, block.fit_input)
            args = [d for d, dims in args]
            estimator = block.estimator
            if is_estimator_stateless(estimator):
                block.estimator_ = estimator
            elif block.model_path is not None and os.path.isfile(block.model_path):
                _load_estimator.__name__ = f"load_{block.estimator_name}"
                block.estimator_ = dask.delayed(_load_estimator)(block)
            elif block.input_dask_array:
                ds = ds.persist()
                args = _get_dask_args_from_ds(ds, block.fit_input)
                args = [d for d, dims in args]
                block.estimator_ = _fit(*args, block=block)
            else:
                _fit.__name__ = f"{block.estimator_name}.fit"
                block.estimator_ = dask.delayed(_fit)(
                    *args,
                    block=block,
                )

        mn = "transform"
        if i == len(self.graph) - 1:
            if do_fit:
                break
            mn = method_name

        if block.features_dir is None:
            args = _get_dask_args_from_ds(ds, block.transform_input)
            dims, data = _blockwise_with_block(
                args, block, mn, input_has_keys=False
            )
        else:
            dims, data = _transform_or_load(block, ds, block.transform_input, mn)

        # replace data inside dataset
        ds = ds.copy(deep=False)
        del ds["data"]

        persisted = False
        if not np.all(np.isfinite(data.shape)):
            block.estimator_, data = dask.persist(block.estimator_, data)
            data = data.compute_chunk_sizes()
            persisted = True

        ds["data"] = (dims, data)
        if persisted:
            ds = ds.persist()

    return ds
def gradient_descent(X, y, max_steps=100, tol=1e-14, family=Logistic):
    '''Michael Grant's implementation of Gradient Descent.'''
    loglike, gradient = family.loglike, family.gradient
    n, p = X.shape
    firstBacktrackMult = 0.1
    nextBacktrackMult = 0.5
    armijoMult = 0.1
    stepGrowth = 1.25
    stepSize = 1.0
    recalcRate = 10
    backtrackMult = firstBacktrackMult
    beta = np.zeros(p)

    for k in range(max_steps):
        # how necessary is this recalculation?
        if k % recalcRate == 0:
            Xbeta = X.dot(beta)
            func = loglike(Xbeta, y)

        grad = gradient(Xbeta, X, y)
        Xgradient = X.dot(grad)

        # backtracking line search
        lf = func
        stepSize, _, _, func = compute_stepsize_dask(
            beta, grad, Xbeta, Xgradient, y, func,
            family=family, backtrackMult=backtrackMult,
            armijoMult=armijoMult, stepSize=stepSize)

        beta, stepSize, Xbeta, lf, func, grad, Xgradient = persist(
            beta, stepSize, Xbeta, lf, func, grad, Xgradient)

        stepSize, lf, func, grad = compute(stepSize, lf, func, grad)

        beta = beta - stepSize * grad  # tiny bit of repeat work here to avoid communication
        Xbeta = Xbeta - stepSize * Xgradient

        if stepSize == 0:
            print('No more progress')
            break

        df = lf - func
        df /= max(func, lf)
        if df < tol:
            print('Converged')
            break
        stepSize *= stepGrowth
        backtrackMult = nextBacktrackMult

    return beta
def test_persist_nested(c):
    a = delayed(1) + 5
    b = a + 1
    c = a + 2
    result = persist({"a": a, "b": [1, 2, b]}, (c, 2), 4, [5])
    assert isinstance(result[0]["a"], Delayed)
    assert isinstance(result[0]["b"][2], Delayed)
    assert isinstance(result[1][0], Delayed)

    sol = ({"a": 6, "b": [1, 2, 7]}, (8, 2), 4, [5])
    assert compute(*result) == sol

    res = persist([a, b], c, 4, [5], traverse=False)
    assert res[0][0] is a
    assert res[0][1] is b
    assert res[1].compute() == 8
    assert res[2:] == (4, [5])
def test_expand_persist(c, s, a, b):
    low = delayed(inc)(1, dask_key_name='low')
    many = [delayed(slowinc)(i, delay=0.1) for i in range(4)]
    high = delayed(inc)(2, dask_key_name='high')

    low, high, x, y, z, w = persist(low, high, *many,
                                    priority={low: -1, high: 1})
    yield wait(high)
    assert s.tasks[low.key].state == 'processing'
def benchmark(tuners, challenges, iterations, detailed_output=False):
    """Score ``tuners`` against a list of ``challenges`` for the given number of iterations.

    This function scores a collection of ``tuners`` against a collection of
    ``challenges``, performing tuning iterations in order to obtain a better
    score. At the end, the best score for each tuner / challenge pair is
    returned as a ``pandas.DataFrame``.

    Args:
        tuners (dict):
            Python dictionary with the ``name`` of the function as ``key`` and
            the callable function that returns the best score for a given
            ``scorer``. This function must have three arguments:

                * scorer (function):
                    A function that performs scoring over params.
                * tunable (btb.tuning.Tunable):
                    A ``Tunable`` instance used to instantiate a tuner.
                * iterations (int):
                    Number of tuning iterations to perform.

        challenges (list):
            A list of ``challenges``. These challenges must inherit from
            ``btb.challenges.challenge.Challenge``.
        iterations (int):
            Number of tuning iterations to perform for each tuner and each
            challenge.
        detailed_output (bool):
            If ``True``, a dataframe with the elapsed time, score and
            iterations is returned.

    Returns:
        pandas.DataFrame:
            A ``pandas.DataFrame`` with the obtained scores for the given
            challenges.
    """
    delayed = []
    for challenge in challenges:
        result = _evaluate_tuners_on_challenge(tuners, challenge, iterations)
        delayed.extend(result)

    persisted = dask.persist(*delayed)
    try:
        progress(persisted)
    except ValueError:
        # Using local client. No progress bar needed.
        pass

    results = dask.compute(*persisted)

    df = pd.DataFrame.from_records(results)
    if detailed_output:
        return df

    df = df.pivot(index='challenge', columns='tuner', values='score')
    del df.columns.name
    del df.index.name

    return df
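# A minimal usage sketch for ``benchmark`` above, kept as comments because the
# concrete names are hypothetical stand-ins rather than part of the original
# code: per the docstring, each tuner is a callable taking
# (scorer, tunable, iterations) and returning its best score, and each
# challenge should inherit from ``btb.challenges.challenge.Challenge``.
#
# scores = benchmark(
#     tuners={'random_search': random_search},   # hypothetical tuner callable
#     challenges=[MyChallenge()],                # hypothetical Challenge subclass
#     iterations=100,
# )
# print(scores)  # one row per challenge, one column per tuner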
def test_dataset_dask(ms):
    datasets = read_datasets(ms, [], [], [])
    assert len(datasets) == 1
    ds = datasets[0]

    # All dask arrays
    for k, v in ds.data_vars.items():
        assert isinstance(v.data, da.Array)

    # Test variable compute
    v2 = dask.compute(v)[0]
    assert isinstance(v2, xr.DataArray if have_xarray else Variable)
    assert isinstance(v2.data, np.ndarray)

    # Test variable persist
    v3 = dask.persist(v)[0]
    assert isinstance(v3, xr.DataArray if have_xarray else Variable)
    # Now have numpy array in the graph
    assert len(v3.data.__dask_keys__()) == 1
    data = next(iter(v3.__dask_graph__().values()))
    assert isinstance(data, np.ndarray)

    assert_array_equal(v2.data, v3.data)

    # Test compute
    nds = dask.compute(ds)[0]

    for k, v in nds.data_vars.items():
        assert isinstance(v.data, np.ndarray)
        cdata = getattr(ds, k).data
        assert_array_equal(cdata, v.data)

    # Test persist
    nds = dask.persist(ds)[0]

    for k, v in nds.data_vars.items():
        assert isinstance(v.data, da.Array)
        # Now have numpy array in the graph
        assert len(v.data.__dask_keys__()) == 1
        data = next(iter(v.data.__dask_graph__().values()))
        assert isinstance(data, np.ndarray)
        cdata = getattr(ds, k).data
        assert_array_equal(cdata, v.data)
def cg_initialize(A, b, x_init=None):
    if x_init is None:
        x = 0 * b
    else:
        x = 1 * x_init
    r = A.dot(x) - b
    p = 1 * r
    x, r, p = dask.persist(x, r, p)
    return x, r, p
def test_future(self):
    """compute_with_trace() can handle Futures."""
    client = Client(processes=False)
    self.addCleanup(client.shutdown)
    [bag] = dask.persist(from_sequence([1, 2, 3]))
    bag = bag.map(lambda x: x * 5)
    result = dask.compute(bag)
    self.assertEqual(result, ([5, 10, 15],))
    self.assertEqual(result, compute_with_trace(bag))
def test_admm_with_large_lamduh(N, p, nchunks):
    X = da.random.random((N, p), chunks=(N // nchunks, p))
    beta = np.random.random(p)
    y = make_y(X, beta=np.array(beta), chunks=(N // nchunks,))

    X, y = persist(X, y)
    z = admm(X, y, reg=L1, lamduh=1e4, rho=20, max_iter=500)

    assert np.allclose(z, np.zeros(p), atol=1e-4)
def get_array_moments(
        array: da.core.Array,
        mean: bool = True,
        std: bool = True,
        std_method: str = 'binom',
        axis: int = 0
) -> Tuple[Optional[da.core.Array], Optional[da.core.Array]]:
    """Computes specified array moments

    Parameters
    ----------
    array : array_like, shape (N, P)
        Array that moments will be computed from
    mean : bool
        Flag whether to compute mean of "array" along "axis"
    std : bool
        Flag whether to compute std of "array" along "axis"
    std_method : str
        Method used to compute standard deviation. Possible methods are:
            'norm'  ===> Normal Distribution Standard Deviation. See np.std
            'binom' ===> Binomial Standard Deviation sqrt(2*p*(1-p)),
                         where p = "mean"/2
    axis : int
        Axis to compute mean and std along.

    Returns
    -------
    array_mean : da.core.array, optional
        If "mean" is false, returns None.
        Otherwise returns the array mean.
    array_std : da.core.array, optional
        If "std" is false, returns None.
        Otherwise returns the array std.
    """
    array_mean = None
    array_std = None

    if mean:
        array_mean = da.nanmean(array, axis=axis)

    if std:
        if std_method == 'binom':
            u = array_mean if mean else da.nanmean(array, axis=axis)
            u /= 2
            array_std = da.sqrt(2 * u * (1 - u))
        elif std_method == 'norm':
            array_std = da.nanstd(array, axis=axis)
        else:
            raise NotImplementedError(
                f'std_method, {std_method}, is not implemented ')

    array_mean, array_std = persist(array_mean, array_std)

    return array_mean, array_std
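# Small NumPy illustration of the 'binom' rule documented above: if the values
# in a column follow Binomial(2, p), their mean is 2p and their variance is
# 2*p*(1-p), so p can be recovered as mean/2 and the standard deviation as
# sqrt(2*p*(1-p)).  This is a standalone sketch, not part of the original API.
import numpy as np

column = np.array([0., 1., 2., 1., 0., 2.])
p = np.nanmean(column) / 2
binom_std = np.sqrt(2 * p * (1 - p))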
def test_persist_nested(loop):
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop):
            a = delayed(1) + 5
            b = a + 1
            c = a + 2
            result = persist({'a': a, 'b': [1, 2, b]}, (c, 2), 4, [5])
            assert isinstance(result[0]['a'], Delayed)
            assert isinstance(result[0]['b'][2], Delayed)
            assert isinstance(result[1][0], Delayed)

            sol = ({'a': 6, 'b': [1, 2, 7]}, (8, 2), 4, [5])
            assert compute(*result) == sol

            res = persist([a, b], c, 4, [5], traverse=False)
            assert res[0][0] is a
            assert res[0][1] is b
            assert res[1].compute() == 8
            assert res[2:] == (4, [5])
async def test_annotate_persist(c, s, a, b):
    with dask.annotate(priority=-1):
        low = delayed(inc)(1, dask_key_name="low")
    with dask.annotate(priority=1):
        high = delayed(inc)(2, dask_key_name="high")
    many = [delayed(slowinc)(i, delay=0.1) for i in range(4)]

    low, high, x, y, z, w = persist(low, high, *many, optimize_graph=False)
    await wait(high)
    assert s.tasks[low.key].state == "processing"
def test_dont_recompute_if_persisted_4(c, s, a, b):
    x = delayed(inc)(1, dask_key_name='x')
    y = delayed(inc)(x, dask_key_name='y')
    z = delayed(inc)(x, dask_key_name='z')

    yy = y.persist()
    yield wait(yy)

    old = s.story('x')

    while s.tasks['x'].state == 'memory':
        yield gen.sleep(0.01)

    yyy, zzz = dask.persist(y, z)
    yield wait([yyy, zzz])

    new = s.story('x')
    assert len(new) > len(old)
def test_custom_collection():
    dsk = {'a': 1, 'b': 2}
    dsk2 = {'c': (add, 'a', 'b'),
            'd': (add, 'c', 1)}
    dsk2.update(dsk)
    dsk3 = {'e': (add, 'a', 4),
            'f': (inc, 'e')}
    dsk3.update(dsk)

    x = Tuple(dsk, ['a', 'b'])
    y = Tuple(dsk2, ['c', 'd'])
    z = Tuple(dsk3, ['e', 'f'])

    # __slots__ defined on base mixin class propagates
    with pytest.raises(AttributeError):
        x.foo = 1

    # is_dask_collection
    assert is_dask_collection(x)

    # tokenize
    assert tokenize(x) == tokenize(x)
    assert tokenize(x) != tokenize(y)

    # compute
    assert x.compute() == (1, 2)
    assert dask.compute(x, [y, z]) == ((1, 2), [(3, 4), (5, 6)])
    t = x + y + z
    assert t.compute() == (1, 2, 3, 4, 5, 6)

    # persist
    t2 = t.persist()
    assert isinstance(t2, Tuple)
    assert t2._dask == dict(zip('abcdef', range(1, 7)))
    assert t2.compute() == (1, 2, 3, 4, 5, 6)

    x2, y2, z2 = dask.persist(x, y, z)
    t3 = x2 + y2 + z2
    assert t2._dask == t3._dask