def get_colors(palette, funcs): """Get a dict mapping funcs to colors from palette. Parameters ---------- palette : string Name of the bokeh palette to use, must be a member of bokeh.palettes.all_palettes. funcs : iterable Iterable of function names """ palettes = import_required("bokeh.palettes", _BOKEH_MISSING_MSG) unique_funcs = sorted(unique(funcs)) n_funcs = len(unique_funcs) palette_lookup = palettes.all_palettes[palette] keys = list(sorted(palette_lookup.keys())) index = keys[min(bisect_left(keys, n_funcs), len(keys) - 1)] palette = palette_lookup[index] # Some bokeh palettes repeat colors, we want just the unique set palette = list(unique(palette)) if len(palette) > n_funcs: # Consistently shuffle palette - prevents just using low-range random.Random(42).shuffle(palette) color_lookup = dict(zip(unique_funcs, cycle(palette))) return [color_lookup[n] for n in funcs]
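# Hedged sketch (not dask's API) of the same color-assignment idea as get_colors above,
# using a hard-coded illustrative palette dict instead of bokeh.palettes.all_palettes:
# pick the smallest palette at least as large as the number of unique names, drop
# repeated colors, shuffle deterministically, and cycle colors over the names.
import random
from bisect import bisect_left
from itertools import cycle

_DEMO_PALETTES = {
    3: ["#111111", "#222222", "#333333"],
    5: ["#111111", "#222222", "#333333", "#444444", "#555555"],
}

def assign_colors(funcs, palette_lookup=_DEMO_PALETTES):
    unique_funcs = sorted(set(funcs))
    keys = sorted(palette_lookup)
    # Smallest palette with at least len(unique_funcs) colors, capped at the largest one
    index = keys[min(bisect_left(keys, len(unique_funcs)), len(keys) - 1)]
    palette = list(dict.fromkeys(palette_lookup[index]))  # de-duplicate, keep order
    if len(palette) > len(unique_funcs):
        random.Random(42).shuffle(palette)  # deterministic shuffle, as in get_colors
    lookup = dict(zip(unique_funcs, cycle(palette)))
    return [lookup[f] for f in funcs]

assign_colors(["sum", "add", "sum", "mul"])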
def read_avro(urlpath, blocksize=100000000, storage_options=None): """Read set of avro files Use this with arbitrary nested avro schemas. Please refer to the fastavro documentation for its capabilities: https://github.com/fastavro/fastavro Parameters ---------- urlpath: string or list Absolute or relative filepath, URL (may include protocols like ``s3://``), or globstring pointing to data. blocksize: int or None Size of chunks in bytes. If None, there will be no chunking and each file will become one partition. storage_options: dict or None passed to backend file-system """ from dask.utils import import_required from dask import delayed, compute from dask.bytes.core import open_files, read_bytes from dask.bag import from_delayed import_required( 'fastavro', "fastavro is a required dependency for using " "bag.read_avro().") storage_options = storage_options or {} files = open_files(urlpath, **storage_options) if blocksize is not None: dhead = delayed(open_head) heads = compute(*[dhead(f) for f in files]) dread = delayed(read_chunk) bits = [] for head, f in zip(heads, files): _, chunks = read_bytes(f.path, sample=False, blocksize=blocksize, delimiter=head['sync'], include_path=False, **storage_options) bits.extend([dread(ch, head) for ch in chunks[0]]) return from_delayed(bits) else: files = open_files(urlpath, **storage_options) dread = delayed(read_file) chunks = [dread(fo) for fo in files] return from_delayed(chunks)
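# Hedged usage sketch for the reader above: assumes fastavro is installed and that
# "data/*.avro" (a hypothetical path) matches real avro files.
import dask.bag as db

bag = db.read_avro("data/*.avro", blocksize=None)  # one partition per file
bag.take(2)                                        # peek at the first two records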
def make_people(npartitions=10, records_per_partition=1000, seed=None, locale="en"): """Make a dataset of random people This makes a Dask Bag with dictionary records of randomly generated people. This requires the optional library ``mimesis`` to generate records. Parameters ---------- npartitions : int Number of partitions records_per_partition : int Number of records in each partition seed : int, (optional) Random seed locale : str Language locale, like 'en', 'fr', 'zh', or 'ru' Returns ------- b: Dask Bag """ import_required( "mimesis", "The mimesis module is required for this function. Try:\n" " python -m pip install mimesis", ) schema = lambda field: { "age": field("person.age"), "name": (field("person.name"), field("person.surname")), "occupation": field("person.occupation"), "telephone": field("person.telephone"), "address": {"address": field("address.address"), "city": field("address.city")}, "credit-card": { "number": field("payment.credit_card_number"), "expiration-date": field("payment.credit_card_expiration_date"), }, } return _make_mimesis( {"locale": locale}, schema, npartitions, records_per_partition, seed )
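# Hedged usage sketch: assumes the optional mimesis dependency is installed.
import dask.datasets

people = dask.datasets.make_people(npartitions=2, records_per_partition=5, seed=42)
people.take(1)  # a tuple with one record dict: age, name, occupation, telephone, ...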
def run(self): psutil = import_required( "psutil", "Tracking resource usage requires `psutil` to be installed" ) self.parent = psutil.Process(self.parent_pid) pid = current_process() data = [] while True: try: msg = self.child_conn.recv() except KeyboardInterrupt: continue if msg == "shutdown": break elif msg == "collect": ps = self._update_pids(pid) while not data or not self.child_conn.poll(): tic = default_timer() mem = cpu = 0 for p in ps: try: mem2 = p.memory_info().rss cpu2 = p.cpu_percent() except Exception: # could be a few different exceptions pass else: # Only increment if both were successful mem += mem2 cpu += cpu2 data.append((tic, mem / 1e6, cpu)) sleep(self.dt) elif msg == "send_data": self.child_conn.send(data) data = [] self.child_conn.close()
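# Hedged usage sketch of the ResourceProfiler that drives this tracker process;
# assumes psutil is installed. dt is the sampling interval in seconds.
import dask.array as da
from dask.diagnostics import ResourceProfiler

x = da.random.random((2000, 2000), chunks=(500, 500))
with ResourceProfiler(dt=0.25) as rprof:
    x.dot(x.T).sum().compute()
rprof.results  # list of (time, memory_MB, cpu_percent) samples collected by run()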
def initialize_event_loop(config): event_loop = dask.config.get("distributed.admin.event-loop") if event_loop == "uvloop": uvloop = import_required( "uvloop", "The distributed.admin.event-loop configuration value " "is set to 'uvloop' but the uvloop module is not installed" "\n\n" "Please either change the config value or install one of the following\n" " conda install uvloop\n" " pip install uvloop", ) uvloop.install() elif event_loop in {"asyncio", "tornado"}: if WINDOWS: # WindowsProactorEventLoopPolicy is not compatible with tornado 6 # fallback to the pre-3.8 default of Selector # https://github.com/tornadoweb/tornado/issues/2608 asyncio.set_event_loop_policy( asyncio.WindowsSelectorEventLoopPolicy()) else: raise ValueError( "Expected distributed.admin.event-loop to be in ('asyncio', 'tornado', 'uvloop'), got %s" % dask.config.get("distributed.admin.event-loop"))
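# Hedged sketch: the configuration value this function reads can be set in a dask
# config file or programmatically before the scheduler/client event loop starts.
import dask

dask.config.set({"distributed.admin.event-loop": "tornado"})
dask.config.get("distributed.admin.event-loop")  # -> "tornado"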
def cytoscape_graph( dsk, filename: str | None = "mydask", format: str | None = None, *, rankdir: str = "BT", node_sep: float = 10, edge_sep: float = 10, spacing_factor: float = 1, node_style: dict[str, str] | None = None, edge_style: dict[str, str] | None = None, **kwargs, ): """ Create an ipycytoscape widget for a dask graph. If `filename` is not None, write an HTML file to disk with the specified name. This uses the Cytoscape dagre layout algorithm. Options for that are documented here: https://github.com/cytoscape/cytoscape.js-dagre#api Parameters ---------- dsk : dict The graph to display. filename : str or None, optional The name of the HTML file to write to disk. format : str, optional Not used in this engine. rankdir: str The direction in which to orient the visualization. node_sep: float The separation (in px) between nodes in the DAG layout. edge_sep: float The separation (in px) between edges in the DAG layout. spacing_factor: float An overall scaling factor to increase (>1) or decrease (<1) the spacing of the layout. node_style: dict[str, str], optional A dictionary of style attributes for nodes (refer to Cytoscape JSON docs for available options: https://js.cytoscape.org/#notation/elements-json) edge_style: dict[str, str], optional A dictionary of style attributes for edges (refer to Cytoscape JSON docs for available options: https://js.cytoscape.org/#notation/elements-json) **kwargs Additional keyword arguments to forward to `_to_cytoscape_json`. Returns ------- result : ipycytoscape.CytoscapeWidget """ ipycytoscape = import_required( "ipycytoscape", "Drawing dask graphs with the cytoscape engine requires the `ipycytoscape` " "python library.\n\n" "Please either conda or pip install as follows:\n\n" " conda install ipycytoscape # either conda install\n" " python -m pip install ipycytoscape # or pip install", ) node_style = node_style or {} edge_style = edge_style or {} data = _to_cytoscape_json(dsk, **kwargs) # TODO: it's not easy to programmatically increase the height of the widget. # Ideally we would make it a bit bigger, but that will probably require upstreaming # some fixes. g = ipycytoscape.CytoscapeWidget( layout={"height": "400px"}, ) g.set_layout( name="dagre", rankDir=rankdir, nodeSep=node_sep, edgeSep=edge_sep, spacingFactor=spacing_factor, nodeDimensionsIncludeLabels=True, ) g.graph.add_graph_from_json( data, directed=True, ) g.set_style( [ { "selector": "node", "style": { "font-family": "helvetica", "font-size": "24px", "font-weight": "bold", "color": "black", "background-color": "#eee", "border-color": "data(color)", "border-width": 4, "opacity": "1.0", "text-valign": "center", "text-halign": "center", "label": "data(label)", "shape": "data(shape)", "width": 64, "height": 64, **node_style, }, }, { "selector": "edge", "style": { "width": 8, "line-color": "gray", "target-arrow-shape": "triangle", "target-arrow-color": "gray", "curve-style": "bezier", **edge_style, }, }, ], ) # Tweak the zoom sensitivity z = g.zoom g.max_zoom = z * 2.0 g.min_zoom = z / 10.0 g.wheel_sensitivity = 0.1 if filename is not None: from ipywidgets.embed import embed_minimal_html filename = filename if filename.endswith(".html") else filename + ".html" embed_minimal_html(filename, views=[g], title="Dask task graph") return g
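# Hedged usage sketch: assumes ipycytoscape (and a Jupyter frontend) is available.
# Passing engine="cytoscape" to visualize() is expected to route to cytoscape_graph
# above; rankdir is forwarded, and filename=None skips writing the HTML file.
import dask.array as da

x = da.ones((10, 10), chunks=(5, 5)).sum()
widget = x.visualize(engine="cytoscape", rankdir="LR", filename=None)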
def to_graphviz( dsk, data_attributes=None, function_attributes=None, rankdir="BT", graph_attr=None, node_attr=None, edge_attr=None, collapse_outputs=False, verbose=False, **kwargs, ): graphviz = import_required( "graphviz", "Drawing dask graphs with the graphviz engine requires the `graphviz` " "python library and the `graphviz` system library.\n\n" "Please either conda or pip install as follows:\n\n" " conda install python-graphviz # either conda install\n" " python -m pip install graphviz # or pip install and follow installation instructions", ) data_attributes = data_attributes or {} function_attributes = function_attributes or {} graph_attr = graph_attr or {} node_attr = node_attr or {} edge_attr = edge_attr or {} graph_attr["rankdir"] = rankdir node_attr["fontname"] = "helvetica" graph_attr.update(kwargs) g = graphviz.Digraph( graph_attr=graph_attr, node_attr=node_attr, edge_attr=edge_attr ) seen = set() connected = set() for k, v in dsk.items(): k_name = name(k) if istask(v): func_name = name((k, "function")) if not collapse_outputs else k_name if collapse_outputs or func_name not in seen: seen.add(func_name) attrs = function_attributes.get(k, {}).copy() attrs.setdefault("label", key_split(k)) attrs.setdefault("shape", "circle") g.node(func_name, **attrs) if not collapse_outputs: g.edge(func_name, k_name) connected.add(func_name) connected.add(k_name) for dep in get_dependencies(dsk, k): dep_name = name(dep) if dep_name not in seen: seen.add(dep_name) attrs = data_attributes.get(dep, {}).copy() attrs.setdefault("label", box_label(dep, verbose)) attrs.setdefault("shape", "box") g.node(dep_name, **attrs) g.edge(dep_name, func_name) connected.add(dep_name) connected.add(func_name) elif ishashable(v) and v in dsk: v_name = name(v) g.edge(v_name, k_name) connected.add(v_name) connected.add(k_name) if (not collapse_outputs or k_name in connected) and k_name not in seen: seen.add(k_name) attrs = data_attributes.get(k, {}).copy() attrs.setdefault("label", box_label(k, verbose)) attrs.setdefault("shape", "box") g.node(k_name, **attrs) return g
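# Hedged usage sketch: assumes the graphviz python package and system library are
# installed. visualize() builds on to_graphviz above; rankdir and collapse_outputs
# are forwarded as keyword arguments.
import dask.array as da

y = (da.ones((8, 8), chunks=(4, 4)) + 1).sum()
y.visualize(filename="graph.png", rankdir="LR", collapse_outputs=True)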
def read_avro(urlpath, blocksize=100000000, storage_options=None, compression=None): """Read set of avro files Use this with arbitrary nested avro schemas. Please refer to the fastavro documentation for its capabilities: https://github.com/fastavro/fastavro Parameters ---------- urlpath: string or list Absolute or relative filepath, URL (may include protocols like ``s3://``), or globstring pointing to data. blocksize: int or None Size of chunks in bytes. If None, there will be no chunking and each file will become one partition. storage_options: dict or None passed to backend file-system compression: str or None Compression format of the target(s), like 'gzip'. Should only be used with blocksize=None. """ from dask.utils import import_required from dask import delayed, compute from dask.bytes.core import (open_files, get_fs_token_paths, OpenFile, tokenize) from dask.bag import from_delayed import_required('fastavro', "fastavro is a required dependency for using " "bag.read_avro().") storage_options = storage_options or {} if blocksize is not None: fs, fs_token, paths = get_fs_token_paths( urlpath, mode='rb', storage_options=storage_options) dhead = delayed(open_head) out = compute(*[dhead(fs, path, compression) for path in paths]) heads, sizes = zip(*out) dread = delayed(read_chunk) offsets = [] lengths = [] for size in sizes: off = list(range(0, size, blocksize)) length = [blocksize] * len(off) offsets.append(off) lengths.append(length) out = [] for path, offset, length, head in zip(paths, offsets, lengths, heads): delimiter = head['sync'] f = OpenFile(fs, path, compression=compression) token = tokenize(fs_token, delimiter, path, fs.ukey(path), compression, offset) keys = ['read-avro-%s-%s' % (o, token) for o in offset] values = [dread(f, o, l, head, dask_key_name=key) for o, key, l in zip(offset, keys, length)] out.extend(values) return from_delayed(out) else: files = open_files(urlpath, compression=compression, **storage_options) dread = delayed(read_file) chunks = [dread(fo) for fo in files] return from_delayed(chunks)
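# Hedged usage sketch: per the docstring above, compression is only meaningful with
# blocksize=None, since a compressed file cannot be split on the avro sync marker.
# The path "archive/*.avro.gz" is hypothetical.
import dask.bag as db

bag = db.read_avro("archive/*.avro.gz", blocksize=None, compression="gzip")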
def to_avro(b, filename, schema, name_function=None, storage_options=None, codec="null", sync_interval=16000, metadata=None, compute=True, **kwargs): """Write bag to set of avro files The schema is a complex dictionary describing the data, see https://avro.apache.org/docs/1.8.2/gettingstartedpython.html#Defining+a+schema and https://fastavro.readthedocs.io/en/latest/writer.html . Its structure is as follows:: {'name': 'Test', 'namespace': 'Test', 'doc': 'Descriptive text', 'type': 'record', 'fields': [ {'name': 'a', 'type': 'int'}, ]} where the "name" field is required, but "namespace" and "doc" are optional descriptors; "type" must always be "record". The list of fields should have an entry for every key of the input records, and the types are like the primitive, complex or logical types of the Avro spec ( https://avro.apache.org/docs/1.8.2/spec.html ). Results in one avro file per input partition. Parameters ---------- b: dask.bag.Bag filename: list of str or str Filenames to write to. If a list, number must match the number of partitions. If a string, must include a glob character "*", which will be expanded using name_function schema: dict Avro schema dictionary, see above name_function: None or callable Expands integers into strings, see ``dask.bytes.utils.build_name_function`` storage_options: None or dict Extra key/value options to pass to the backend file-system codec: 'null', 'deflate', or 'snappy' Compression algorithm sync_interval: int Number of records to include in each block within a file metadata: None or dict Included in the file header compute: bool If True, files are written immediately, and function blocks. If False, returns delayed objects, which can be computed by the user where convenient. kwargs: passed to compute(), if compute=True Examples -------- >>> import dask.bag as db >>> b = db.from_sequence([{'name': 'Alice', 'value': 100}, ... {'name': 'Bob', 'value': 200}]) >>> schema = {'name': 'People', 'doc': "Set of people's scores", ... 'type': 'record', ... 'fields': [ ... {'name': 'name', 'type': 'string'}, ... {'name': 'value', 'type': 'int'}]} >>> b.to_avro('my-data.*.avro', schema) # doctest: +SKIP ['my-data.0.avro', 'my-data.1.avro'] """ # TODO infer schema from first partition of data from dask.utils import import_required from dask.bytes.core import open_files import_required( "fastavro", "fastavro is a required dependency for using bag.to_avro().") _verify_schema(schema) storage_options = storage_options or {} files = open_files(filename, "wb", name_function=name_function, num=b.npartitions, **storage_options) name = "to-avro-" + uuid.uuid4().hex dsk = {(name, i): ( _write_avro_part, (b.name, i), f, schema, codec, sync_interval, metadata, ) for i, f in enumerate(files)} graph = HighLevelGraph.from_collections(name, dsk, dependencies=[b]) out = type(b)(graph, name, b.npartitions) if compute: out.compute(**kwargs) return [f.path for f in files] else: return out.to_delayed()
def plot_cache( results, dsk, start_time, metric_name, palette="Viridis", label_size=60, **kwargs ): """Visualize the results of profiling in a bokeh plot. Parameters ---------- results : sequence Output of CacheProfiler.results dsk : dict The dask graph being profiled. start_time : float Start time of the profile. metric_name : string Metric used to measure cache size palette : string, optional Name of the bokeh palette to use, must be a member of bokeh.palettes.all_palettes. label_size: int (optional) Maximum size of output labels in plot, defaults to 60 **kwargs Other keyword arguments, passed to bokeh.figure. These will override all defaults set by visualize. Returns ------- The completed bokeh plot object. """ bp = import_required("bokeh.plotting", _BOKEH_MISSING_MSG) from bokeh.models import HoverTool defaults = dict( title="Profile Results", tools="hover,save,reset,wheel_zoom,xpan", toolbar_location="above", width=800, height=300, ) # Support plot_width and plot_height for backwards compatibility if "plot_width" in kwargs: kwargs["width"] = kwargs.pop("plot_width") if BOKEH_VERSION().major >= 3: warnings.warn("Use width instead of plot_width with Bokeh >= 3") if "plot_height" in kwargs: kwargs["height"] = kwargs.pop("plot_height") if BOKEH_VERSION().major >= 3: warnings.warn("Use height instead of plot_height with Bokeh >= 3") defaults.update(**kwargs) if results: starts, ends = list(zip(*results))[3:] tics = sorted(unique(starts + ends)) groups = groupby(lambda d: pprint_task(d[1], dsk, label_size), results) data = {} for k, vals in groups.items(): cnts = dict.fromkeys(tics, 0) for v in vals: cnts[v.cache_time] += v.metric cnts[v.free_time] -= v.metric data[k] = [0] + list(accumulate(add, pluck(1, sorted(cnts.items())))) tics = [0] + [i - start_time for i in tics] p = bp.figure(x_range=[0, max(tics)], **defaults) for (key, val), color in zip(data.items(), get_colors(palette, data.keys())): p.line( "x", "y", line_color=color, line_width=3, source=bp.ColumnDataSource( {"x": tics, "y": val, "label": [key for i in val]} ), ) else: p = bp.figure(y_range=[0, 10], x_range=[0, 10], **defaults) p.yaxis.axis_label = f"Cache Size ({metric_name})" p.xaxis.axis_label = "Time (s)" hover = p.select(HoverTool) hover.tooltips = """ <div> <span style="font-size: 14px; font-weight: bold;">Task:</span> <span style="font-size: 10px; font-family: Monaco, monospace;">@label</span> </div> """ return p
def plot_resources(results, palette="Viridis", **kwargs): """Plot resource usage in a bokeh plot. Parameters ---------- results : sequence Output of ResourceProfiler.results palette : string, optional Name of the bokeh palette to use, must be a member of bokeh.palettes.all_palettes. **kwargs Other keyword arguments, passed to bokeh.figure. These will override all defaults set by plot_resources. Returns ------- The completed bokeh plot object. """ bp = import_required("bokeh.plotting", _BOKEH_MISSING_MSG) from bokeh import palettes from bokeh.models import LinearAxis, Range1d defaults = dict( title="Profile Results", tools="save,reset,xwheel_zoom,xpan", toolbar_location="above", width=800, height=300, ) # Support plot_width and plot_height for backwards compatibility if "plot_width" in kwargs: kwargs["width"] = kwargs.pop("plot_width") if BOKEH_VERSION().major >= 3: warnings.warn("Use width instead of plot_width with Bokeh >= 3") if "plot_height" in kwargs: kwargs["height"] = kwargs.pop("plot_height") if BOKEH_VERSION().major >= 3: warnings.warn("Use height instead of plot_height with Bokeh >= 3") # Drop `label_size` to match `plot_cache` and `plot_tasks` kwargs if "label_size" in kwargs: kwargs.pop("label_size") defaults.update(**kwargs) if results: t, mem, cpu = zip(*results) left, right = min(t), max(t) t = [i - left for i in t] p = bp.figure( y_range=fix_bounds(0, max(cpu), 100), x_range=fix_bounds(0, right - left, 1), **defaults, ) else: t = mem = cpu = [] p = bp.figure(y_range=(0, 100), x_range=(0, 1), **defaults) colors = palettes.all_palettes[palette][6] p.line( t, cpu, color=colors[0], line_width=4, legend_label="% CPU", ) p.yaxis.axis_label = "% CPU" p.extra_y_ranges = { "memory": Range1d( *fix_bounds(min(mem) if mem else 0, max(mem) if mem else 100, 100) ) } p.line( t, mem, color=colors[2], y_range_name="memory", line_width=4, legend_label="Memory", ) p.add_layout(LinearAxis(y_range_name="memory", axis_label="Memory (MB)"), "right") p.xaxis.axis_label = "Time (s)" return p
def plot_tasks(results, dsk, palette="Viridis", label_size=60, **kwargs): """Visualize the results of profiling in a bokeh plot. Parameters ---------- results : sequence Output of Profiler.results dsk : dict The dask graph being profiled. palette : string, optional Name of the bokeh palette to use, must be a member of bokeh.palettes.all_palettes. label_size: int (optional) Maximum size of output labels in plot, defaults to 60 **kwargs Other keyword arguments, passed to bokeh.figure. These will override all defaults set by visualize. Returns ------- The completed bokeh plot object. """ bp = import_required("bokeh.plotting", _BOKEH_MISSING_MSG) from bokeh.models import HoverTool defaults = dict( title="Profile Results", tools="hover,save,reset,xwheel_zoom,xpan", toolbar_location="above", width=800, height=300, ) # Support plot_width and plot_height for backwards compatibility if "plot_width" in kwargs: kwargs["width"] = kwargs.pop("plot_width") if "plot_height" in kwargs: kwargs["height"] = kwargs.pop("plot_height") defaults.update(**kwargs) if results: keys, tasks, starts, ends, ids = zip(*results) id_group = groupby(itemgetter(4), results) timings = { k: [i.end_time - i.start_time for i in v] for (k, v) in id_group.items() } id_lk = { t[0]: n for (n, t) in enumerate( sorted(timings.items(), key=itemgetter(1), reverse=True) ) } left = min(starts) right = max(ends) p = bp.figure( y_range=[str(i) for i in range(len(id_lk))], x_range=[0, right - left], **defaults, ) data = {} data["width"] = width = [e - s for (s, e) in zip(starts, ends)] data["x"] = [w / 2 + s - left for (w, s) in zip(width, starts)] data["y"] = [id_lk[i] + 1 for i in ids] data["function"] = funcs = [pprint_task(i, dsk, label_size) for i in tasks] data["color"] = get_colors(palette, funcs) data["key"] = [str(i) for i in keys] source = bp.ColumnDataSource(data=data) p.rect( source=source, x="x", y="y", height=1, width="width", color="color", line_color="gray", ) else: p = bp.figure(y_range=[str(i) for i in range(8)], x_range=[0, 10], **defaults) p.grid.grid_line_color = None p.axis.axis_line_color = None p.axis.major_tick_line_color = None p.yaxis.axis_label = "Worker ID" p.xaxis.axis_label = "Time (s)" hover = p.select(HoverTool) hover.tooltips = """ <div> <span style="font-size: 14px; font-weight: bold;">Key:</span> <span style="font-size: 10px; font-family: Monaco, monospace;">@key</span> </div> <div> <span style="font-size: 14px; font-weight: bold;">Task:</span> <span style="font-size: 10px; font-family: Monaco, monospace;">@function</span> </div> """ hover.point_policy = "follow_mouse" return p
def visualize( profilers, filename="profile.html", show=True, save=None, mode=None, **kwargs ): """Visualize the results of profiling in a bokeh plot. If multiple profilers are passed in, the plots are stacked vertically. Parameters ---------- profilers : profiler or list Profiler or list of profilers. filename : string, optional Name of the plot output file. show : boolean, optional If True (default), the plot is opened in a browser. save : boolean, optional If True (default when not in notebook), the plot is saved to disk. mode : str, optional Mode passed to bokeh.output_file() **kwargs Other keyword arguments, passed to bokeh.figure. These will override all defaults set by visualize. Returns ------- The completed bokeh plot object. """ bp = import_required("bokeh.plotting", _BOKEH_MISSING_MSG) from bokeh.io import state if "file_path" in kwargs: warnings.warn( "The file_path keyword argument is deprecated " "and will be removed in a future release. " "Please use filename instead.", category=FutureWarning, stacklevel=2, ) filename = kwargs.pop("file_path") if save is None: save = not state.curstate().notebook if not isinstance(profilers, list): profilers = [profilers] figs = [prof._plot(**kwargs) for prof in profilers] # Stack the plots if len(figs) == 1: p = figs[0] else: top = figs[0] for f in figs[1:]: f.x_range = top.x_range f.title = None f.min_border_top = 20 if BOKEH_VERSION().major < 3: f.plot_height -= 30 else: f.height -= 30 for f in figs[:-1]: f.xaxis.axis_label = None f.min_border_bottom = 20 if BOKEH_VERSION().major < 3: f.plot_height -= 30 else: f.height -= 30 for f in figs: f.min_border_left = 75 f.min_border_right = 75 p = bp.gridplot([[f] for f in figs]) if show: bp.show(p) if save: bp.output_file(filename, mode=mode) bp.save(p) return p
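# Hedged end-to-end sketch combining the profilers whose plots are defined above;
# assumes bokeh (and psutil for ResourceProfiler) is installed. The stacked figures
# share the time axis.
import dask.array as da
from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler, visualize

a = da.random.random((4000, 4000), chunks=(1000, 1000))
with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof, CacheProfiler() as cprof:
    a.dot(a.T).mean().compute()

visualize([prof, rprof, cprof], filename="profile.html", show=False, save=True)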
def percentile(a, q, method="linear", internal_method="default", **kwargs): """Approximate percentile of 1-D array Parameters ---------- a : Array q : array_like of float Percentile or sequence of percentiles to compute, which must be between 0 and 100 inclusive. method : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}, optional The interpolation method to use when the desired percentile lies between two data points ``i < j``. Only valid for ``method='dask'``. - 'linear': ``i + (j - i) * fraction``, where ``fraction`` is the fractional part of the index surrounded by ``i`` and ``j``. - 'lower': ``i``. - 'higher': ``j``. - 'nearest': ``i`` or ``j``, whichever is nearest. - 'midpoint': ``(i + j) / 2``. .. versionchanged:: 2022.1.0 This argument was previously called "interpolation" internal_method : {'default', 'dask', 'tdigest'}, optional What internal method to use. By default will use dask's internal custom algorithm (``'dask'``). If set to ``'tdigest'`` will use tdigest for floats and ints and fallback to the ``'dask'`` otherwise. .. versionchanged:: 2022.1.0 This argument was previously called “method”. interpolation : str, optional Deprecated name for the method keyword argument. .. deprecated:: 2022.1.0 See Also -------- numpy.percentile : Numpy's equivalent Percentile function """ from .dispatch import percentile_lookup as _percentile from .utils import array_safe, meta_from_array allowed_internal_methods = ["default", "dask", "tdigest"] if method in allowed_internal_methods: warnings.warn( "In Dask 2022.1.0, the `method=` argument was renamed to `internal_method=`", FutureWarning, ) internal_method = method if "interpolation" in kwargs: warnings.warn( "In Dask 2022.1.0, the `interpolation=` argument to percentile was renamed to " "`method= ` ", FutureWarning, ) method = kwargs.pop("interpolation") if kwargs: raise TypeError( f"percentile() got an unexpected keyword argument {kwargs.keys()}") if not a.ndim == 1: raise NotImplementedError( "Percentiles only implemented for 1-d arrays") if isinstance(q, Number): q = [q] q = array_safe(q, like=meta_from_array(a)) token = tokenize(a, q, method) dtype = a.dtype if np.issubdtype(dtype, np.integer): dtype = (array_safe([], dtype=dtype, like=meta_from_array(a)) / 0.5).dtype meta = meta_from_array(a, dtype=dtype) if internal_method not in allowed_internal_methods: raise ValueError( f"`internal_method=` must be one of {allowed_internal_methods}") # Allow using t-digest if method is allowed and dtype is of floating or integer type if (internal_method == "tdigest" and method == "linear" and (np.issubdtype(dtype, np.floating) or np.issubdtype(dtype, np.integer))): from dask.utils import import_required import_required( "crick", "crick is a required dependency for using the t-digest method.") name = "percentile_tdigest_chunk-" + token dsk = {(name, i): (_tdigest_chunk, key) for i, key in enumerate(a.__dask_keys__())} name2 = "percentile_tdigest-" + token dsk2 = {(name2, 0): (_percentiles_from_tdigest, q, sorted(dsk))} # Otherwise use the custom percentile algorithm else: # Add 0 and 100 during calculation for more robust behavior (hopefully) calc_q = np.pad(q, 1, mode="constant") calc_q[-1] = 100 name = "percentile_chunk-" + token dsk = {(name, i): (_percentile, key, calc_q, method) for i, key in enumerate(a.__dask_keys__())} name2 = "percentile-" + token dsk2 = { (name2, 0): ( merge_percentiles, q, [calc_q] * len(a.chunks[0]), sorted(dsk), method, ) } dsk = merge(dsk, dsk2) graph = HighLevelGraph.from_collections(name2, dsk, 
dependencies=[a]) return Array(graph, name2, chunks=((len(q), ), ), meta=meta)
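# Hedged usage sketch for the approximate percentile above; the t-digest path
# additionally requires the optional crick dependency.
import dask.array as da

x = da.random.random(100_000, chunks=10_000)
da.percentile(x, [25, 50, 75]).compute()                             # default internal method
da.percentile(x, [25, 50, 75], internal_method="tdigest").compute()  # t-digest internal method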
from .threadpoolexecutor import rejoin from .utils import CancelledError, TimeoutError, sync from .variable import Variable from .worker import Reschedule, Worker, get_client, get_worker, secede from .worker_client import local_client, worker_client versions = get_versions() __version__ = versions["version"] __git_revision__ = versions["full-revisionid"] del get_versions, versions if dask.config.get("distributed.admin.event-loop") in ("asyncio", "tornado"): pass elif dask.config.get("distributed.admin.event-loop") == "uvloop": import_required( "uvloop", "The distributed.admin.event-loop configuration value " "is set to 'uvloop' but the uvloop module is not installed" "\n\n" "Please either change the config value or install one of the following\n" " conda install uvloop\n" " pip install uvloop", ) import uvloop uvloop.install() else: raise ValueError( "Expected distributed.admin.event-loop to be in ('asyncio', 'tornado', 'uvloop'), got %s" % dask.config.get("distributed.admin.event-loop"))
import os import re from functools import partial from dask.core import get_dependencies, ishashable, istask from dask.utils import apply, funcname, import_required, key_split graphviz = import_required( "graphviz", "Drawing dask graphs requires the `graphviz` python library and the " "`graphviz` system library.\n\n" "Please either conda or pip install as follows:\n\n" " conda install python-graphviz # either conda install\n" " python -m pip install graphviz # or pip install and follow installation instructions", ) def task_label(task): """Label for a task on a dot graph. Examples -------- >>> from operator import add >>> task_label((add, 1, 2)) 'add' >>> task_label((add, (add, 1, 2), 3)) 'add(...)' """ func = task[0] if func is apply: func = task[1]
def to_graphviz( hg, data_attributes=None, function_attributes=None, rankdir="BT", graph_attr=None, node_attr=None, edge_attr=None, **kwargs, ): from dask.dot import label, name graphviz = import_required( "graphviz", "Drawing dask graphs with the graphviz visualization engine requires the `graphviz` " "python library and the `graphviz` system library.\n\n" "Please either conda or pip install as follows:\n\n" " conda install python-graphviz # either conda install\n" " python -m pip install graphviz # or pip install and follow installation instructions", ) data_attributes = data_attributes or {} function_attributes = function_attributes or {} graph_attr = graph_attr or {} node_attr = node_attr or {} edge_attr = edge_attr or {} graph_attr["rankdir"] = rankdir node_attr["shape"] = "box" node_attr["fontname"] = "helvetica" graph_attr.update(kwargs) g = graphviz.Digraph(graph_attr=graph_attr, node_attr=node_attr, edge_attr=edge_attr) n_tasks = {} for layer in hg.dependencies: n_tasks[layer] = len(hg.layers[layer]) min_tasks = min(n_tasks.values()) max_tasks = max(n_tasks.values()) cache = {} color = kwargs.get("color") if color == "layer_type": layer_colors = { "DataFrameIOLayer": ["#CCC7F9", False], # purple "ShuffleLayer": ["#F9CCC7", False], # rose "SimpleShuffleLayer": ["#F9CCC7", False], # rose "ArrayOverlayLayer": ["#FFD9F2", False], # pink "BroadcastJoinLayer": ["#D9F2FF", False], # blue "Blockwise": ["#D9FFE6", False], # green "BlockwiseLayer": ["#D9FFE6", False], # green "MaterializedLayer": ["#DBDEE5", False], # gray } for layer in hg.dependencies: layer_name = name(layer) attrs = data_attributes.get(layer, {}) node_label = label(layer, cache=cache) node_size = (20 if max_tasks == min_tasks else int(20 + ((n_tasks[layer] - min_tasks) / (max_tasks - min_tasks)) * 20)) layer_type = str(type(hg.layers[layer]).__name__) node_tooltips = ( f"A {layer_type.replace('Layer', '')} Layer with {n_tasks[layer]} Tasks.\n" ) layer_ca = hg.layers[layer].collection_annotations if layer_ca: if layer_ca.get("type") == "dask.array.core.Array": node_tooltips += ( f"Array Shape: {layer_ca.get('shape')}\n" f"Data Type: {layer_ca.get('dtype')}\n" f"Chunk Size: {layer_ca.get('chunksize')}\n" f"Chunk Type: {layer_ca.get('chunk_type')}\n") if layer_ca.get("type") == "dask.dataframe.core.DataFrame": dftype = {"pandas.core.frame.DataFrame": "pandas"} cols = layer_ca.get("columns") node_tooltips += ( f"Number of Partitions: {layer_ca.get('npartitions')}\n" f"DataFrame Type: {dftype.get(layer_ca.get('dataframe_type'))}\n" f"{len(cols)} DataFrame Columns: {str(cols) if len(str(cols)) <= 40 else '[...]'}\n" ) attrs.setdefault("label", str(node_label)) attrs.setdefault("fontsize", str(node_size)) attrs.setdefault("tooltip", str(node_tooltips)) if color == "layer_type": node_color = layer_colors.get(layer_type)[0] layer_colors.get(layer_type)[1] = True attrs.setdefault("fillcolor", str(node_color)) attrs.setdefault("style", "filled") g.node(layer_name, **attrs) for layer, deps in hg.dependencies.items(): layer_name = name(layer) for dep in deps: dep_name = name(dep) g.edge(dep_name, layer_name) if color == "layer_type": legend_title = "Key" legend_label = ( '<<TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0" CELLPADDING="5">' "<TR><TD><B>Legend: Layer types</B></TD></TR>") for layer_type, color in layer_colors.items(): if color[1]: legend_label += f'<TR><TD BGCOLOR="{color[0]}">{layer_type}</TD></TR>' legend_label += "</TABLE>>" attrs = data_attributes.get(legend_title, {}) attrs.setdefault("label", str(legend_label)) 
attrs.setdefault("fontsize", "20") attrs.setdefault("margin", "0") g.node(legend_title, **attrs) return g
def percentile(a, q, interpolation='linear', method='default'): """ Approximate percentile of 1-D array Parameters ---------- a : Array q : array_like of float Percentile or sequence of percentiles to compute, which must be between 0 and 100 inclusive. interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}, optional The interpolation method to use when the desired percentile lies between two data points ``i < j``. Only valid for ``method='dask'``. * 'linear': ``i + (j - i) * fraction``, where ``fraction`` is the fractional part of the index surrounded by ``i`` and ``j``. * 'lower': ``i``. * 'higher': ``j``. * 'nearest': ``i`` or ``j``, whichever is nearest. * 'midpoint': ``(i + j) / 2``. method : {'default', 'dask', 'tdigest'}, optional What method to use. By default will use dask's internal custom algorithm (``'dask'``). If set to ``'tdigest'`` will use tdigest for floats and ints and fallback to the ``'dask'`` otherwise. See Also -------- numpy.percentile : Numpy's equivalent Percentile function """ if not a.ndim == 1: raise NotImplementedError( "Percentiles only implemented for 1-d arrays") if isinstance(q, Number): q = [q] q = np.array(q) token = tokenize(a, q, interpolation) dtype = a.dtype if np.issubdtype(dtype, np.integer): dtype = (np.array([], dtype=dtype) / 0.5).dtype allowed_methods = ['default', 'dask', 'tdigest'] if method not in allowed_methods: raise ValueError("method can only be 'default', 'dask' or 'tdigest'") if method == 'default': internal_method = 'dask' else: internal_method = method # Allow using t-digest if interpolation is allowed and dtype is of floating or integer type if (internal_method == 'tdigest' and interpolation == 'linear' and (np.issubdtype(dtype, np.floating) or np.issubdtype(dtype, np.integer))): from dask.utils import import_required import_required('crick', 'crick is a required dependency for using the t-digest ' 'method.') name = 'percentile_tdigest_chunk-' + token dsk = dict(((name, i), (_tdigest_chunk, (key))) for i, key in enumerate(a.__dask_keys__())) name2 = 'percentile_tdigest-' + token dsk2 = {(name2, 0): (_percentiles_from_tdigest, q, sorted(dsk))} # Otherwise use the custom percentile algorithm else: name = 'percentile_chunk-' + token dsk = dict(((name, i), (_percentile, (key), q, interpolation)) for i, key in enumerate(a.__dask_keys__())) name2 = 'percentile-' + token dsk2 = {(name2, 0): (merge_percentiles, q, [q] * len(a.chunks[0]), sorted(dsk), interpolation)} dsk = merge(dsk, dsk2) graph = HighLevelGraph.from_collections(name2, dsk, dependencies=[a]) return Array(graph, name2, chunks=((len(q),),), dtype=dtype)
def percentile(a, q, interpolation="linear", method="default"): """Approximate percentile of 1-D array Parameters ---------- a : Array q : array_like of float Percentile or sequence of percentiles to compute, which must be between 0 and 100 inclusive. interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}, optional The interpolation method to use when the desired percentile lies between two data points ``i < j``. Only valid for ``method='dask'``. - 'linear': ``i + (j - i) * fraction``, where ``fraction`` is the fractional part of the index surrounded by ``i`` and ``j``. - 'lower': ``i``. - 'higher': ``j``. - 'nearest': ``i`` or ``j``, whichever is nearest. - 'midpoint': ``(i + j) / 2``. method : {'default', 'dask', 'tdigest'}, optional What method to use. By default will use dask's internal custom algorithm (``'dask'``). If set to ``'tdigest'`` will use tdigest for floats and ints and fallback to the ``'dask'`` otherwise. See Also -------- numpy.percentile : Numpy's equivalent Percentile function """ if not a.ndim == 1: raise NotImplementedError( "Percentiles only implemented for 1-d arrays") if isinstance(q, Number): q = [q] q = np.array(q) token = tokenize(a, q, interpolation) dtype = a.dtype if np.issubdtype(dtype, np.integer): dtype = (np.array([], dtype=dtype) / 0.5).dtype allowed_methods = ["default", "dask", "tdigest"] if method not in allowed_methods: raise ValueError("method can only be 'default', 'dask' or 'tdigest'") if method == "default": internal_method = "dask" else: internal_method = method # Allow using t-digest if interpolation is allowed and dtype is of floating or integer type if (internal_method == "tdigest" and interpolation == "linear" and (np.issubdtype(dtype, np.floating) or np.issubdtype(dtype, np.integer))): from dask.utils import import_required import_required( "crick", "crick is a required dependency for using the t-digest method.") name = "percentile_tdigest_chunk-" + token dsk = dict(((name, i), (_tdigest_chunk, key)) for i, key in enumerate(a.__dask_keys__())) name2 = "percentile_tdigest-" + token dsk2 = {(name2, 0): (_percentiles_from_tdigest, q, sorted(dsk))} # Otherwise use the custom percentile algorithm else: # Add 0 and 100 during calculation for more robust behavior (hopefully) calc_q = np.pad(q, 1, mode="constant") calc_q[-1] = 100 name = "percentile_chunk-" + token dsk = dict(((name, i), (_percentile, key, calc_q, interpolation)) for i, key in enumerate(a.__dask_keys__())) name2 = "percentile-" + token dsk2 = { (name2, 0): ( merge_percentiles, q, [calc_q] * len(a.chunks[0]), sorted(dsk), interpolation, ) } dsk = merge(dsk, dsk2) graph = HighLevelGraph.from_collections(name2, dsk, dependencies=[a]) return Array(graph, name2, chunks=((len(q), ), ), dtype=dtype)