def build_cli(args=None):
    if not args:
        args = parse_args()
    else:
        args = parse_args(args)
    filter_dirty = any(args.packages) or not args._all
    outputs = get_dask_outputs(args.path, packages=args.packages,
                               filter_dirty=filter_dirty, git_rev=args.git_rev,
                               stop_rev=args.stop_rev, steps=args.steps,
                               max_downstream=args.max_downstream,
                               visualize=args.visualize, test=args.test)

    if args.visualize:
        # setattr(nx.drawing, 'graphviz_layout', nx.nx_pydot.graphviz_layout)
        # graphviz_graph = nx.draw_graphviz(graph, 'dot')
        # graphviz_graph.draw(args.visualize)
        visualize(*outputs, filename=args.visualize)  # create neat-looking graph
    else:
        # Many threads, because this is just the dispatch and takes very little
        # compute; the threads mostly wait for builds to complete.
        cluster = LocalCluster(n_workers=1, threads_per_worker=args.threads,
                               nanny=False)
        client = Client(cluster)
        futures = client.persist(outputs)
        progress(futures)
def test_visualize_lists(tmpdir):
    pytest.importorskip('graphviz')
    fn = os.path.join(str(tmpdir), 'myfile.dot')
    dask.visualize([{'abc-xyz': (add, 1, 2)}], filename=fn)
    with open(fn) as f:
        text = f.read()
    assert 'abc-xyz' in text
def exc(self):
    """Build the delayed series computations, render their task graph, and run them."""
    computations = [
        dask.delayed(self.series)(code=c['code'],
                                  institution=c['institution'],
                                  region=c['region'])
        for c in self.institutions.to_dict(orient='records')
    ]
    dask.visualize(computations, filename='highcharts', format='pdf')
    dask.compute(computations, scheduler='processes')
    self.inspect()
def __init__(self, root_dir, png_path=None):
    self.session_dirs = list(find_session_dirs((root_dir,)))
    d = {}
    for session_dir in self.session_dirs:
        session_dir = str(session_dir)
        for task in missing_tasks(session_dir):
            d[('task_name', task.name)] = task.name
            dependencies = [(dt, session_dir) for dt in task.depends_on]
            d[(task.name, session_dir)] = (
                run_task, ('task_name', task.name), session_dir, dependencies)
        d[('end', session_dir)] = (
            _count,
            [(task_name, session_dir) for task_name in TASK_CLASSES.keys()])
    if png_path:
        visualize(d, filename=png_path)
    self.graph = d
    self.create_cluster()
def visualize(self, filename='mydask', format=None, **kwargs):
    """Render the task graph for this parameter search using ``graphviz``.

    Requires ``graphviz`` to be installed.

    Parameters
    ----------
    filename : str or None, optional
        The name (without an extension) of the file to write to disk. If
        `filename` is None, no file will be written, and we communicate
        with dot using only pipes.
    format : {'png', 'pdf', 'dot', 'svg', 'jpeg', 'jpg'}, optional
        Format in which to write output file. Default is 'png'.
    **kwargs
        Additional keyword arguments to forward to ``dask.dot.to_graphviz``.

    Returns
    -------
    result : IPython.display.Image, IPython.display.SVG, or None
        See ``dask.dot.dot_graph`` for more information.
    """
    check_is_fitted(self, 'dask_graph_')
    return dask.visualize(self.dask_graph_, filename=filename,
                          format=format, **kwargs)
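# Hedged usage sketch for the method above; `search` stands in for any fitted
# object exposing dask_graph_ (the name is illustrative, not part of the API):
#
#     search.visualize(filename=None, format='svg')   # in-memory SVG, no file
#     search.visualize(filename='search_graph')       # writes search_graph.png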
def visualize(self, filename, **kwargs):
    """Generate an image describing the reporting structure.

    This is a shorthand for :func:`dask.visualize`. Requires
    `graphviz <https://pypi.org/project/graphviz/>`__.
    """
    return dask.visualize(self.graph, filename=filename, **kwargs)
def exc(self):
    """
    :return: concatenated DataFrame of the per-model calculations
    """
    calculations = []
    for model in self.models:
        measures = self.measures_(model=model)
        densities = self.densities_(model=model)
        properties = self.properties_(model=model)
        calculations.append(self.concatenate(measures, densities, properties))
    dask.visualize(calculations, filename='calculations', format='pdf')
    values = dask.compute(calculations, scheduler='processes')[0]
    return pd.concat(values, ignore_index=True)
def main(args):
    start_date = datetime.strptime(str(args.start_date), "%Y%m%d")
    start_date = start_date.replace(hour=args.time_of_day)
    num_days = args.num_days
    dates = [start_date + timedelta(i) for i in range(-2, num_days)]
    source_path = args.source_file
    res = get_urls(dates, source_path)
    if args.visualize:
        dask.visualize(*res, filename='get_article_graph.svg')
    else:
        urlfiles = dask.compute(*res)
        urlfiles = sorted(urlfiles)
        # prune links over a sliding window of three consecutive files
        _ = [pruneLinks(urlfiles[i - 2:i + 1]) for i in range(2, len(urlfiles))]
        print("\n\nDone!\n\n")
def visualize(
    self,
    filename: str = "mydask",
    format: str | None = None,
    optimize_graph: bool = False,
    **kwargs: Any,
) -> DisplayObject | None:
    return dask.visualize(
        self,
        filename=filename,
        format=format,
        optimize_graph=optimize_graph,
        **kwargs,
    )
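# Usage sketch for the wrapper above (hypothetical collection `obj` that mixes
# in this method). Passing optimize_graph=True asks dask to render the graph
# after scheduler-level optimizations, which is usually much smaller:
#
#     obj.visualize(filename="graph", format="svg", optimize_graph=True)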
def visualize(dsk, df_tasks, label="", color="", current_time=0, **kwargs):
    """
    Draw a dask graph enhanced by additional information.

    Parameters
    ----------
    dsk: dict
        Dask task graph. Must be plottable by dask.visualize.
    df_tasks: pd.DataFrame
        DataFrame of the dask task stream data. The "key" column is
        mandatory to assign a row of the DataFrame to a node in the graph.
        The "key" column must be of type string even when the key is a
        tuple, because otherwise the type is not compatible with formats
        like parquet.
    label: str
        Column name of the df_tasks DataFrame which contains the value for
        the node label.
    color: str
        Column name of the df_tasks DataFrame which contains the node fill
        color information. If the values are numerical, the node is filled
        with grayscale tones and the label font color is adjusted to stay
        readable. If the values are strings, each unique value is assigned
        a different color. If the value is "progress", each started node is
        filled with red and each finished node with blue; set the current
        time with the argument "current_time". This option needs the
        columns "start_delta" and "stop_delta" in the df_tasks DataFrame,
        containing the seconds passed since the start of the graph
        execution.
    current_time: float
        If color is set to "progress", this sets the current time
        influencing the fill color of the nodes.
    """
    attributes = _get_dsk_attributes(dsk, df_tasks, label_col=label,
                                     color_col=color,
                                     current_time=current_time)

    return dask.visualize(dsk,
                          data_attributes=attributes["data"],
                          function_attributes=attributes["func"],
                          **kwargs)
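# Minimal usage sketch for the annotated-graph helper above. The graph and the
# task-stream DataFrame are made-up illustrations; in practice df_tasks would
# come from recorded scheduler data whose "key" strings match the graph keys.
import pandas as pd

def inc(x):
    return x + 1

dsk_example = {'a': 1, 'b': (inc, 'a')}
df_tasks_example = pd.DataFrame({
    'key': ['a', 'b'],          # string keys matching the graph
    'duration': [0.1, 0.5],     # hypothetical label column
    'start_delta': [0.0, 0.1],  # required when color="progress"
    'stop_delta': [0.1, 0.6],
})
visualize(dsk_example, df_tasks_example, label='duration', color='progress',
          current_time=0.3, filename='annotated_graph')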
def visualize_one_block(dataset, **kwargs):
    """
    Visualize one block of a Dataset or DataArray.
    """
    if isinstance(dataset, xr.DataArray):
        dataset = dataset._to_temp_dataset()

    keys = []
    block = get_one_block(dataset.unify_chunks())
    graph = block.__dask_graph__()
    for name, variable in block.variables.items():
        if isinstance(variable.data, dask.array.Array):
            key = (variable.data.name,) + (0,) * variable.ndim
            keys.append(key)

    if graph is None:
        raise ValueError("No dask variables!")

    return dask.visualize(graph.cull(set(keys)), **kwargs)
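# Hedged usage sketch for visualize_one_block; assumes the module's
# get_one_block helper (not shown here) indexes out the first chunk of each
# variable. The dataset below is a made-up illustration.
import dask.array as da
import xarray as xr

ds = xr.Dataset({'t': (('x', 'y'), da.ones((4, 4), chunks=(2, 2)))})
visualize_one_block(ds, filename='one_block.svg')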
def plot_dask(self, filename):
    visualize(self.as_dict(), filename=filename, collapse_outputs=True)
def _clean(**kw):
    args = OmegaConf.create(kw)
    OmegaConf.set_struct(args, True)

    import numpy as np
    import numexpr as ne
    import dask
    import dask.array as da
    from dask.distributed import performance_report
    from pfb.utils.fits import load_fits, set_wcs, save_fits, data_from_header
    from pfb.opt.hogbom import hogbom
    from astropy.io import fits

    print("Loading dirty", file=log)
    dirty = load_fits(args.dirty, dtype=args.output_type).squeeze()
    nband, nx, ny = dirty.shape
    hdr = fits.getheader(args.dirty)

    print("Loading psf", file=log)
    psf = load_fits(args.psf, dtype=args.output_type).squeeze()
    _, nx_psf, ny_psf = psf.shape
    hdr_psf = fits.getheader(args.psf)

    wsums = np.amax(psf.reshape(-1, nx_psf * ny_psf), axis=1)
    wsum = np.sum(wsums)

    psf /= wsum
    psf_mfs = np.sum(psf, axis=0)
    assert (psf_mfs.max() - 1.0) < 1e-4

    dirty /= wsum
    dirty_mfs = np.sum(dirty, axis=0)

    # get info required to set WCS
    ra = np.deg2rad(hdr['CRVAL1'])
    dec = np.deg2rad(hdr['CRVAL2'])
    radec = [ra, dec]

    cell_deg = np.abs(hdr['CDELT1'])
    if cell_deg != np.abs(hdr['CDELT2']):
        raise NotImplementedError('cell sizes have to be equal')
    cell_rad = np.deg2rad(cell_deg)

    freq_out, ref_freq = data_from_header(hdr, axis=3)

    hdr_mfs = set_wcs(cell_deg, cell_deg, nx, ny, radec, ref_freq)

    save_fits(args.output_filename + '_dirty_mfs.fits', dirty_mfs, hdr_mfs,
              dtype=args.output_type)

    # set up Hessian approximation
    if args.weight_table is not None:
        normfact = wsum
        from africanus.gridding.wgridder.dask import hessian
        from pfb.utils.misc import plan_row_chunk
        from daskms.experimental.zarr import xds_from_zarr

        xds = xds_from_zarr(args.weight_table)[0]
        nrow = xds.row.size
        freqs = xds.chan.data
        nchan = freqs.size

        # bin edges
        fmin = freqs.min()
        fmax = freqs.max()
        fbins = np.linspace(fmin, fmax, nband + 1)

        # chan <-> band mapping
        band_map = np.zeros(freqs.size, dtype=np.int32)
        for band in range(nband):
            indl = freqs >= fbins[band]
            indu = freqs < fbins[band + 1] + 1e-6
            band_map = np.where(indl & indu, band, band_map)

        # to dask arrays
        bands, bin_counts = np.unique(band_map, return_counts=True)
        band_mapping = tuple(bands)
        chan_chunks = {'chan': tuple(bin_counts)}
        freqs = da.from_array(freqs, chunks=tuple(bin_counts))
        bin_idx = np.append(np.array([0]), np.cumsum(bin_counts))[0:-1]
        freq_bin_idx = da.from_array(bin_idx, chunks=1)
        freq_bin_counts = da.from_array(bin_counts, chunks=1)

        max_chan_chunk = bin_counts.max()
        bin_counts = tuple(bin_counts)
        # the first factor of 3 accounts for the intermediate visibilities
        # produced in Hessian (i.e. complex data + real weights)
        memory_per_row = (3 * max_chan_chunk * xds.WEIGHT.data.itemsize
                          + 3 * xds.UVW.data.itemsize)

        # get approx image size
        pixel_bytes = np.dtype(args.output_type).itemsize
        band_size = nx * ny * pixel_bytes

        if args.host_address is None:
            # nworker bands on single node
            row_chunk = plan_row_chunk(args.mem_limit / args.nworkers,
                                       band_size, nrow, memory_per_row,
                                       args.nthreads_per_worker)
        else:
            # single band per node
            row_chunk = plan_row_chunk(args.mem_limit, band_size, nrow,
                                       memory_per_row,
                                       args.nthreads_per_worker)

        print("nrows = %i, row chunks set to %i for a total of %i chunks per node"
              % (nrow, row_chunk, int(np.ceil(nrow / row_chunk))), file=log)

        def convolver(x):
            model = da.from_array(x, chunks=(1, nx, ny), name=False)

            xds = xds_from_zarr(args.weight_table,
                                chunks={'row': row_chunk,
                                        'chan': bin_counts})[0]

            convolvedim = hessian(xds.UVW.data,
                                  freqs,
                                  model,
                                  freq_bin_idx,
                                  freq_bin_counts,
                                  cell_rad,
                                  weights=xds.WEIGHT.data.astype(args.output_type),
                                  nthreads=args.nvthreads,
                                  epsilon=args.epsilon,
                                  do_wstacking=args.wstack,
                                  double_accum=args.double_accum)
            return convolvedim
    else:
        normfact = 1.0
        from pfb.operators.psf import hessian
        from ducc0.fft import r2c
        iFs = np.fft.ifftshift

        npad_xl = (nx_psf - nx) // 2
        npad_xr = nx_psf - nx - npad_xl
        npad_yl = (ny_psf - ny) // 2
        npad_yr = ny_psf - ny - npad_yl
        padding = ((0, 0), (npad_xl, npad_xr), (npad_yl, npad_yr))
        unpad_x = slice(npad_xl, -npad_xr)
        unpad_y = slice(npad_yl, -npad_yr)
        lastsize = ny + np.sum(padding[-1])

        psf_pad = iFs(psf, axes=(1, 2))
        psfhat = r2c(psf_pad, axes=(1, 2), forward=True,
                     nthreads=args.nvthreads, inorm=0)
        psfhat = da.from_array(psfhat, chunks=(1, -1, -1))

        def convolver(x):
            model = da.from_array(x, chunks=(1, nx, ny), name=False)
            convolvedim = hessian(model, psfhat, padding, args.nvthreads,
                                  unpad_x, unpad_y, lastsize)
            return convolvedim

    # psfo = PSF(psf, dirty.shape, nthreads=args.nthreads)
    # def convolver(x): return psfo.convolve(x)

    rms = np.std(dirty_mfs)
    rmax = np.abs(dirty_mfs).max()

    print("Iter %i: peak residual = %f, rms = %f" % (0, rmax, rms), file=log)

    residual = dirty.copy()
    residual_mfs = dirty_mfs.copy()
    model = np.zeros_like(residual)
    for k in range(args.nmiter):
        print("Running Hogbom", file=log)
        x = hogbom(residual, psf,
                   gamma=args.hb_gamma,
                   pf=args.hb_peak_factor,
                   maxit=args.hb_maxit,
                   verbosity=args.hb_verbose,
                   report_freq=args.hb_report_freq)

        model += x

        print("Getting residual", file=log)
        convimage = convolver(model)
        dask.visualize(convimage,
                       filename=args.output_filename + '_hessian' + str(k) + '_graph.pdf',
                       optimize_graph=False)
        with performance_report(filename=args.output_filename + '_hessian' + str(k) + '_per.html'):
            convimage = dask.compute(convimage, optimize_graph=False)[0]
        ne.evaluate('dirty - convimage/normfact', out=residual,
                    casting='same_kind')
        ne.evaluate('sum(residual, axis=0)', out=residual_mfs,
                    casting='same_kind')

        rms = np.std(residual_mfs)
        rmax = np.abs(residual_mfs).max()

        print("Iter %i: peak residual = %f, rms = %f" % (k + 1, rmax, rms),
              file=log)

    print("Saving results", file=log)
    save_fits(args.output_filename + '_model.fits', model, hdr)
    model_mfs = np.mean(model, axis=0)
    save_fits(args.output_filename + '_model_mfs.fits', model_mfs, hdr_mfs)
    save_fits(args.output_filename + '_residual.fits',
              residual * wsums[:, None, None], hdr)
    save_fits(args.output_filename + '_residual_mfs.fits', residual_mfs, hdr_mfs)

    print("All done here.", file=log)
import dask
import dask.array as da

kwargs = {'bgcolor': '#00000000',
          'rankdir': 'BT',
          'node_attr': {'color': 'black',
                        'fontcolor': '#000000',
                        'penwidth': '3'},
          'edge_attr': {'color': 'black', 'penwidth': '3'}}

x = da.ones((15, 15), chunks=(5, 5))

x.mean(split_every=10).visualize('array-mean.svg', **kwargs)
(x + x.T).visualize('array-xxT.svg', **kwargs)
(x.dot(x.T + 1)).visualize('array-xdotxT.svg', **kwargs)
(x.dot(x.T + 1) - x.mean()).visualize('array-xdotxT-mean.svg', **kwargs)
(x.dot(x.T + 1) - x.mean()).std().visualize('array-xdotxT-mean-std.svg', **kwargs)

N = 25
x = da.ones((N, N), chunks=(5, 5))
xxT = x + x.T
U, S, V = da.linalg.svd(xxT.rechunk((5, N)) - x.mean())
dask.visualize(U, S, V, filename='array-svd.svg', **kwargs)
import dask
from dask.threaded import get
from dask.optimization import cull, inline

# helpers and the 'words' entry below are reconstructed from the dask
# optimization docs (see the link at the end)
def print_and_return(string):
    print(string)
    return string

def format_str(count, val, nwords):
    return f'word list has {count} occurrences of {val}, out of {nwords} words'

dsk = {'words': 'apple orange apple pear orange pear pear',
       'nwords': (len, (str.split, 'words')),
       'val1': 'orange',
       'val2': 'apple',
       'val3': 'pear',
       'count1': (str.count, 'words', 'val1'),
       'count2': (str.count, 'words', 'val2'),
       'count3': (str.count, 'words', 'val3'),
       'out1': (format_str, 'count1', 'val1', 'nwords'),
       'out2': (format_str, 'count2', 'val2', 'nwords'),
       'out3': (format_str, 'count3', 'val3', 'nwords'),
       'print1': (print_and_return, 'out1'),
       'print2': (print_and_return, 'out2'),
       'print3': (print_and_return, 'out3')}

dask.visualize(dsk, filename='/Users/longguangbin/Work/temp/dask2.pdf')

outputs = ['print1', 'print2']
results = get(dsk, outputs)

# cull drops tasks not needed for the requested outputs; inline then
# substitutes cheap constant tasks directly into their dependents
dsk1, dependencies = cull(dsk, outputs)
dsk2 = inline(dsk1, dependencies=dependencies)
results = get(dsk2, outputs)

# https://docs.dask.org/en/latest/optimize.html
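# Follow-on sketch: render the optimized graph too, so the effect of
# cull + inline can be compared against the drawing of the full graph above.
dask.visualize(dsk2, filename='dask2_optimized.pdf')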
def visualize_job(self, name):
    if self.jobs.get(name):
        dask.visualize(self.jobs[name]['future'])
def _predict(ms, stack, **kw):
    args = OmegaConf.create(kw)
    OmegaConf.set_struct(args, True)

    pyscilog.log_to_file(args.output_filename + '.log')
    pyscilog.enable_memory_logging(level=3)

    # number of threads per worker
    if args.nthreads is None:
        if args.host_address is not None:
            raise ValueError("You have to specify nthreads when using a "
                             "distributed scheduler")
        import multiprocessing
        nthreads = multiprocessing.cpu_count()
        args.nthreads = nthreads
    else:
        nthreads = args.nthreads

    if args.mem_limit is None:
        if args.host_address is not None:
            raise ValueError("You have to specify mem-limit when using a "
                             "distributed scheduler")
        import psutil
        mem_limit = int(psutil.virtual_memory()[0] / 1e9)  # 100% of memory by default
        args.mem_limit = mem_limit
    else:
        mem_limit = args.mem_limit

    nband = args.nband
    if args.nworkers is None:
        nworkers = nband
        args.nworkers = nworkers
    else:
        nworkers = args.nworkers

    if args.nthreads_per_worker is None:
        nthreads_per_worker = 1
        args.nthreads_per_worker = nthreads_per_worker
    else:
        nthreads_per_worker = args.nthreads_per_worker

    # the number of chunks being read in simultaneously is equal to
    # the number of dask threads
    nthreads_dask = nworkers * nthreads_per_worker

    if args.ngridder_threads is None:
        if args.host_address is not None:
            ngridder_threads = nthreads // nthreads_per_worker
        else:
            ngridder_threads = nthreads // nthreads_dask
        args.ngridder_threads = ngridder_threads
    else:
        ngridder_threads = args.ngridder_threads

    ms = list(ms)

    print('Input Options:', file=log)
    for key in kw.keys():
        print(' %25s = %s' % (key, args[key]), file=log)

    # numpy imports have to happen after this step
    from pfb import set_client
    set_client(nthreads, mem_limit, nworkers, nthreads_per_worker,
               args.host_address, stack, log)

    import numpy as np
    from pfb.utils.misc import chan_to_band_mapping
    import dask
    from dask.distributed import performance_report
    from dask.graph_manipulation import clone
    from daskms import xds_from_storage_ms as xds_from_ms
    from daskms import xds_from_storage_table as xds_from_table
    from daskms.utils import dataset_type
    mstype = dataset_type(ms[0])
    if mstype == 'casa':
        from daskms import xds_to_table
    elif mstype == 'zarr':
        from daskms.experimental.zarr import xds_to_zarr as xds_to_table
    import dask.array as da
    from africanus.constants import c as lightspeed
    from africanus.gridding.wgridder.dask import model as im2vis
    from pfb.utils.fits import load_fits
    from pfb.utils.misc import restore_corrs, plan_row_chunk
    from astropy.io import fits

    # always returns 4D; gridder expects freq axis
    model = np.atleast_3d(load_fits(args.model).squeeze())
    nband, nx, ny = model.shape
    hdr = fits.getheader(args.model)
    cell_d = np.abs(hdr['CDELT1'])
    cell_rad = np.deg2rad(cell_d)

    # chan <-> band mapping
    freqs, freq_bin_idx, freq_bin_counts, freq_out, band_mapping, chan_chunks = \
        chan_to_band_mapping(ms, nband=nband)

    # degridder memory budget
    max_chan_chunk = 0
    for ims in ms:
        for spw in freqs[ims]:
            counts = freq_bin_counts[ims][spw].compute()
            max_chan_chunk = np.maximum(max_chan_chunk, counts.max())

    # assumes number of correlations are the same across MS/SPW
    xds = xds_from_ms(ms[0])
    ncorr = xds[0].dims['corr']
    nrow = xds[0].dims['row']
    if args.output_type is not None:
        output_type = np.dtype(args.output_type)
    else:
        output_type = np.result_type(np.dtype(args.real_type), np.complex64)
    data_bytes = output_type.itemsize
    bytes_per_row = max_chan_chunk * ncorr * data_bytes
    memory_per_row = bytes_per_row  # model
    memory_per_row += 3 * 8  # uvw

    if mstype == 'zarr':
        if args.model_column in xds[0].keys():
            model_chunks = getattr(xds[0], args.model_column).data.chunks
        else:
            model_chunks = xds[0].DATA.data.chunks
            print('Chunking model same as data')

    # get approx image size
    # this is not a conservative estimate when multiple SPWs map to a single
    # imaging band
    pixel_bytes = np.dtype(args.output_type).itemsize
    band_size = nx * ny * pixel_bytes

    if args.host_address is None:
        # full image on single node
        row_chunk = plan_row_chunk(mem_limit / nworkers, band_size, nrow,
                                   memory_per_row, nthreads_per_worker)
    else:
        # single band per node
        row_chunk = plan_row_chunk(mem_limit, band_size, nrow,
                                   memory_per_row, nthreads_per_worker)

    if args.row_chunks is not None:
        row_chunk = int(args.row_chunks)
        if row_chunk == -1:
            row_chunk = nrow

    print("nrows = %i, row chunks set to %i for a total of %i chunks per node"
          % (nrow, row_chunk, int(np.ceil(nrow / row_chunk))), file=log)

    chunks = {}
    for ims in ms:
        chunks[ims] = []  # xds_from_ms expects a list per ds
        for spw in freqs[ims]:
            chunks[ims].append({'row': row_chunk,
                                'chan': chan_chunks[ims][spw]['chan']})

    model = da.from_array(model.astype(args.real_type),
                          chunks=(1, nx, ny), name=False)
    writes = []
    radec = None  # assumes we are only imaging field 0 of first MS
    for ims in ms:
        xds = xds_from_ms(ims, chunks=chunks[ims], columns=('UVW',))

        # subtables
        ddids = xds_from_table(ims + "::DATA_DESCRIPTION")
        fields = xds_from_table(ims + "::FIELD")
        spws = xds_from_table(ims + "::SPECTRAL_WINDOW")
        pols = xds_from_table(ims + "::POLARIZATION")

        # subtable data
        ddids = dask.compute(ddids)[0]
        fields = dask.compute(fields)[0]
        spws = dask.compute(spws)[0]
        pols = dask.compute(pols)[0]

        out_data = []
        for ds in xds:
            field = fields[ds.FIELD_ID]

            # check fields match
            if radec is None:
                radec = field.PHASE_DIR.data.squeeze()
            if not np.array_equal(radec, field.PHASE_DIR.data.squeeze()):
                continue

            spw = ds.DATA_DESC_ID  # this is not correct, need to use spw

            uvw = clone(ds.UVW.data)

            bands = band_mapping[ims][spw]
            model_b = model[list(bands), :, :]

            vis = im2vis(uvw,
                         freqs[ims][spw],
                         model_b,
                         freq_bin_idx[ims][spw],
                         freq_bin_counts[ims][spw],
                         cell_rad,
                         nthreads=ngridder_threads,
                         epsilon=args.epsilon,
                         do_wstacking=args.wstack)

            model_vis = restore_corrs(vis, ncorr)
            if mstype == 'zarr':
                model_vis = model_vis.rechunk(model_chunks)
                uvw = uvw.rechunk((model_chunks[0], 3))

            out_ds = ds.assign(**{args.model_column: (("row", "chan", "corr"),
                                                      model_vis),
                                  'UVW': (("row", "three"), uvw)})
            # out_ds = ds.assign(**{args.model_column: (("row", "chan", "corr"),
            #                                           model_vis)})
            out_data.append(out_ds)
        writes.append(xds_to_table(out_data, ims,
                                   columns=[args.model_column]))

    dask.visualize(*writes,
                   filename=args.output_filename + '_predict_graph.pdf',
                   optimize_graph=False, collapse_outputs=True)

    if not args.mock:
        with performance_report(filename=args.output_filename + '_predict_per.html'):
            dask.compute(writes, optimize_graph=False)

    print("All done here.", file=log)
from dask import visualize
from dask.threaded import get

def do_something_1(x, y):
    return x + y + 2 * x * y

def do_something_2(a, b):
    return a**3 - b**3

def do_something_3(p, q):
    return p * p + q * q

def do_something_4(x):
    return x * 3

# define the graph
dsk = {
    'thrice_1': (do_something_4, 10),
    'thrice_2': (do_something_4, 20),
    'thrice_3': (do_something_4, 30),
    'thrice_4': (do_something_4, 40),
    'square_sum': (do_something_3, 'thrice_1', 'thrice_2'),
    'a_plus_b_whole_square': (do_something_1, 'square_sum', 'thrice_3'),
    'some_complex_stuff': (do_something_2, 'thrice_4', 'a_plus_b_whole_square')
}

print(get(dsk, 'some_complex_stuff'))
visualize(dsk, rankdir="LR", filename="task_graph.png")

# Do More.....
avg_by_postalcode.compute()


# In[ ]:


ops_by_postalcode = narrow_df.set_index("PostCode", npartitions=10)
len(list(ops_by_postalcode.partitions))


# In[ ]:


# Le sad: you can see this doesn't actually respect the partition size of one byte.
dask.visualize(narrow_df.set_index("PostCode", npartitions="auto", partition_size=1))


# In[ ]:


indexed = narrow_df.set_index("PostCode")
#tag::repartition[]
reparted = indexed.repartition(partition_size="20kb")
#end::repartition[]
dask.visualize(narrow_df.set_index("PostCode").repartition(partition_size="20kb"))


# In[ ]:
# %% {"slideshow": {"slide_type": "fragment"}} %%time mean_delay_res, std_delay_res = dask.compute(mean_delay, std_delay) # %% [markdown] {"slideshow": {"slide_type": "slide"}} # Using `dask.compute` takes roughly 1/2 the time. This is because the task graphs for both results are merged when calling `dask.compute`, allowing shared operations to only be done once instead of twice. In particular, using `dask.compute` only does the following once: # # - the calls to `read_csv` # - the filter (`df[~df.Cancelled]`) # - some of the necessary reductions (`sum`, `count`) # # To see what the merged task graphs between multiple results look like (and what's shared), you can use the `dask.visualize` function (we might want to use `filename='graph.pdf'` to zoom in on the graph better): # %% {"slideshow": {"slide_type": "slide"}} dask.visualize(mean_delay, std_delay) # %% [markdown] {"slideshow": {"slide_type": "slide"}} # ## Converting `CRSDepTime` to a timestamp # # This dataset stores timestamps as `HHMM`, which are read in as integers in `read_csv`: # %% {"slideshow": {"slide_type": "fragment"}} # recreate the read_csv task with parsed dates df = dd.read_csv(filename, parse_dates={'Date': [0, 1, 2]}, dtype={'TailNum': str, 'CRSElapsedTime': float, 'Cancelled': bool}) # %% {"slideshow": {"slide_type": "fragment"}}
        self.features = features

i = NumpyInfo("boo", np.array(0))
numpybits = [i]

# Surprisingly this works, despite the implication that we would need to
# call register_generic
from distributed.protocol import register_generic
register_generic(NumpyInfo)
dask.compute(ok_fun(1))
#end::serialize_class_with_numpy[]


# In[ ]:


dask.visualize(ok_fun(1))


# In[ ]:


ok_fun(1).visualize()


# In[ ]:


ok_fun(1)


# In[ ]:


# From ch2 for visualize
@dask.delayed
def crawl(url, depth=0, maxdepth=1, maxlinks=4):
def show_workflow(self, output_filepath=None):
    dask.visualize(self._result, filename=output_filepath)
                               cv=3, n_jobs=-1)
    with joblib.parallel_backend("dask"):
        grid_search.fit(X, y)
    return np.sum(grid_search.predict(X)[:5])


if __name__ == "__main__":
    import networkx

    os.makedirs("graphs", exist_ok=True)

    usecases = {
        "pandas-groupby-1-1T-1H": bench_pandas_groupby(1, "1T", "1H"),
        "pandas-groupby-1-1T-8H": bench_pandas_groupby(1, "1T", "8H"),
        "pandas-join-1-1T-1H": bench_pandas_join(1, "1T", "1H"),
        "pandas-join-1-1T-8H": bench_pandas_join(1, "1T", "8H"),
        "bag-1000": bench_bag(1000),
        "merge-1000": bench_merge(1000),
        "numpy-2000": bench_numpy(2000),
        "tree-8": bench_tree(8),
        "xarray-20": bench_xarray(20)
    }

    for (name, graph) in usecases.items():
        dot_filename = f"graphs/{name}"
        dask.visualize(graph, format="dot", filename=dot_filename)
        dask.visualize(graph, filename=f"graphs/{name}.svg")
        g = networkx.drawing.nx_agraph.read_dot(f"{dot_filename}.dot")
        print(f"""
{name}: {len(g.nodes)} vertices, {len(g.edges)} edges, longest path: {networkx.dag_longest_path_length(g)}
""".strip())