def apply_conjunction(parts, statistics, conjunction):
    for column, operator, value in conjunction:
        out_parts = []
        out_statistics = []
        for part, stats in zip(parts, statistics):
            if "filter" in stats and stats["filter"]:
                continue  # Filtered by engine
            try:
                c = toolz.groupby("name", stats["columns"])[column][0]
                min = c["min"]
                max = c["max"]
            except KeyError:
                out_parts.append(part)
                out_statistics.append(stats)
            else:
                if (
                    operator == "==" and min <= value <= max
                    or operator == "<" and min < value
                    or operator == "<=" and min <= value
                    or operator == ">" and max > value
                    or operator == ">=" and max >= value
                    or operator == "in" and any(min <= item <= max for item in value)
                ):
                    out_parts.append(part)
                    out_statistics.append(stats)

        parts, statistics = out_parts, out_statistics

    return parts, statistics
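# Hedged usage sketch (not from the source): illustrates the data shapes
# apply_conjunction expects, inferred from the function body above. Each entry
# in `statistics` carries per-column min/max stats, and a conjunction is a list
# of (column, operator, value) tuples that must all hold for a part to be kept.
# The part names "part-0"/"part-1" and the column "x" are invented for
# illustration.
import toolz  # required by apply_conjunction

parts = ["part-0", "part-1"]
statistics = [
    {"columns": [{"name": "x", "min": 0, "max": 10}]},
    {"columns": [{"name": "x", "min": 20, "max": 30}]},
]
conjunction = [("x", ">", 15)]

kept_parts, kept_stats = apply_conjunction(parts, statistics, conjunction)
# kept_parts == ["part-1"]: only the second part's max (30) exceeds 15.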
def collections_to_dsk(collections, optimize_graph=True, optimizations=(), **kwargs):
    """
    Convert many collections into a single dask graph, after optimization
    """
    from .highlevelgraph import HighLevelGraph

    optimizations = tuple(optimizations) + tuple(config.get("optimizations", ()))

    if optimize_graph:
        groups = groupby(optimization_function, collections)

        graphs = []
        for opt, val in groups.items():
            dsk, keys = _extract_graph_and_keys(val)
            dsk = opt(dsk, keys, **kwargs)

            for opt in optimizations:
                dsk = opt(dsk, keys, **kwargs)

            graphs.append(dsk)

        # Merge all graphs
        if any(isinstance(graph, HighLevelGraph) for graph in graphs):
            dsk = HighLevelGraph.merge(*graphs)
        else:
            dsk = merge(*map(ensure_dict, graphs))
    else:
        dsk, _ = _extract_graph_and_keys(collections)

    return dsk
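# Hedged usage sketch (assumption, not from the source): collections_to_dsk is
# the helper that dask.compute/persist use to fuse many collections into one
# optimized graph. The dask.array example below is illustrative only.
import dask.array as da
from dask.base import collections_to_dsk

x = da.ones((100,), chunks=(10,))
y = (x + 1).sum()

dsk = collections_to_dsk([y], optimize_graph=True)
# `dsk` is a single graph (possibly a HighLevelGraph) mapping keys to tasks,
# covering everything needed to compute `y`.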
def test_groupby_tasks_3():
    func = lambda x: x % 10
    b = db.range(20, npartitions=5).groupby(func, shuffle="tasks", max_branch=2)
    result = b.compute(scheduler="sync")
    assert dict(result) == groupby(func, range(20))
def collections_to_dsk(collections, optimize_graph=True, **kwargs):
    """
    Convert many collections into a single dask graph, after optimization
    """
    optimizations = kwargs.pop("optimizations", None) or config.get("optimizations", [])

    if optimize_graph:
        groups = groupby(optimization_function, collections)

        _opt_list = []
        for opt, val in groups.items():
            dsk, keys = _extract_graph_and_keys(val)
            _opt = opt(dsk, keys, **kwargs)
            groups[opt] = (_opt, keys)
            _opt_list.append(_opt)

        for opt in optimizations:
            _opt_list = []
            group = {}
            for k, (dsk, keys) in groups.items():
                _opt = opt(dsk, keys, **kwargs)
                group[k] = (_opt, keys)
                _opt_list.append(_opt)
            groups = group

        dsk = merge(*map(ensure_dict, _opt_list))
    else:
        dsk, _ = _extract_graph_and_keys(collections)

    return dsk
def handle_funcs(group):
    groups = tlz.groupby('group', group)
    for name in sorted(groups):
        yield ''
        yield f' # {name}'
        for info in groups[name]:
            yield f' {info["ctext_t"]}'
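# Hedged sketch (not from the source): feeding handle_funcs a small list of
# records to show the grouped, commented output it yields. The field values
# below are invented; the 'group' and 'ctext_t' keys come from the function
# body above. The tlz module ships with the toolz distribution.
import tlz  # required by handle_funcs

records = [
    {"group": "math", "ctext_t": "double sin(double x);"},
    {"group": "math", "ctext_t": "double cos(double x);"},
    {"group": "string", "ctext_t": "size_t strlen(const char *s);"},
]

print("\n".join(handle_funcs(records)))
# Emits a blank line and a "# math" header followed by the math entries,
# then the same for "string".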
async def scatter_to_workers(nthreads, data, rpc=rpc, report=True, serializers=None):
    """ Scatter data directly to workers

    This distributes data in a round-robin fashion to a set of workers based
    on how many cores they have.  nthreads should be a dictionary mapping
    worker identities to numbers of cores.

    See scatter for parameter docstring
    """
    assert isinstance(nthreads, dict)
    assert isinstance(data, dict)

    workers = list(concat([w] * nc for w, nc in nthreads.items()))
    names, data = list(zip(*data.items()))

    worker_iter = drop(_round_robin_counter[0] % len(workers), cycle(workers))
    _round_robin_counter[0] += len(data)

    L = list(zip(worker_iter, names, data))
    d = groupby(0, L)
    d = {worker: {key: value for _, key, value in v} for worker, v in d.items()}

    rpcs = {addr: rpc(addr) for addr in d}
    try:
        out = await All(
            [
                rpcs[address].update_data(
                    data=v, report=report, serializers=serializers
                )
                for address, v in d.items()
            ]
        )
    finally:
        for r in rpcs.values():
            await r.close_rpc()

    nbytes = merge(o["nbytes"] for o in out)

    who_has = {k: [w for w, _, _ in v] for k, v in groupby(1, L).items()}

    return (names, who_has, nbytes)
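# Hedged sketch (not from the source): a standalone illustration of the
# round-robin assignment that scatter_to_workers performs before any RPC.
# The worker addresses and data keys below are invented for illustration.
from itertools import cycle

from toolz import concat, groupby

nthreads = {"tcp://w1:1234": 2, "tcp://w2:1234": 1}  # worker -> core count
data = {"x": 1, "y": 2, "z": 3}

# Workers with more cores appear more often, so they receive more keys.
workers = list(concat([w] * nc for w, nc in nthreads.items()))
names, values = zip(*data.items())

assignments = list(zip(cycle(workers), names, values))
per_worker = {
    worker: {key: value for _, key, value in group}
    for worker, group in groupby(0, assignments).items()
}
# per_worker maps each worker address to the subset of `data` it would receive.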
def broadcast_dimensions(argpairs, numblocks, sentinels=(1, (1,)), consolidate=None):
    """Find block dimensions from arguments

    Parameters
    ----------
    argpairs: iterable
        name, ijk index pairs
    numblocks: dict
        maps {name: number of blocks}
    sentinels: iterable (optional)
        values for singleton dimensions
    consolidate: func (optional)
        use this to reduce each set of common blocks into a smaller set

    Examples
    --------
    >>> argpairs = [('x', 'ij'), ('y', 'ji')]
    >>> numblocks = {'x': (2, 3), 'y': (3, 2)}
    >>> broadcast_dimensions(argpairs, numblocks)
    {'i': 2, 'j': 3}

    Supports numpy broadcasting rules

    >>> argpairs = [('x', 'ij'), ('y', 'ij')]
    >>> numblocks = {'x': (2, 1), 'y': (1, 3)}
    >>> broadcast_dimensions(argpairs, numblocks)
    {'i': 2, 'j': 3}

    Works in other contexts too

    >>> argpairs = [('x', 'ij'), ('y', 'ij')]
    >>> d = {'x': ('Hello', 1), 'y': (1, (2, 3))}
    >>> broadcast_dimensions(argpairs, d)
    {'i': 'Hello', 'j': (2, 3)}
    """
    # List like [('i', 2), ('j', 1), ('i', 1), ('j', 2)]
    argpairs2 = [(a, ind) for a, ind in argpairs if ind is not None]
    L = toolz.concat(
        [
            zip(inds, dims)
            for (x, inds), (x, dims) in toolz.join(
                toolz.first, argpairs2, toolz.first, numblocks.items()
            )
        ]
    )

    g = toolz.groupby(0, L)
    g = dict((k, set([d for i, d in v])) for k, v in g.items())

    g2 = dict((k, v - set(sentinels) if len(v) > 1 else v) for k, v in g.items())

    if consolidate:
        return toolz.valmap(consolidate, g2)

    if g2 and not set(map(len, g2.values())) == set([1]):
        raise ValueError("Shapes do not align %s" % g)

    return toolz.valmap(toolz.first, g2)
def collections_to_dsk(collections, optimize_graph=True, **kwargs):
    """
    Convert many collections into a single dask graph, after optimization
    """
    from .highlevelgraph import HighLevelGraph

    optimizations = kwargs.pop("optimizations", None) or config.get("optimizations", [])

    if optimize_graph:
        groups = groupby(optimization_function, collections)

        _opt_list = []
        for opt, val in groups.items():
            dsk, keys = _extract_graph_and_keys(val)
            groups[opt] = (dsk, keys)
            _opt = opt(dsk, keys, **kwargs)
            _opt_list.append(_opt)

        for opt in optimizations:
            _opt_list = []
            group = {}
            for k, (dsk, keys) in groups.items():
                _opt = opt(dsk, keys, **kwargs)
                group[k] = (_opt, keys)
                _opt_list.append(_opt)
            groups = group

        # Merge all graphs
        if any(isinstance(graph, HighLevelGraph) for graph in _opt_list):
            dsk = HighLevelGraph.merge(*_opt_list)
        else:
            dsk = merge(*map(ensure_dict, _opt_list))
    else:
        dsk, _ = _extract_graph_and_keys(collections)

    return dsk
def plot_cache(
    results, dsk, start_time, metric_name, palette="Viridis", label_size=60, **kwargs
):
    """Visualize the results of profiling in a bokeh plot.

    Parameters
    ----------
    results : sequence
        Output of CacheProfiler.results
    dsk : dict
        The dask graph being profiled.
    start_time : float
        Start time of the profile.
    metric_name : string
        Metric used to measure cache size
    palette : string, optional
        Name of the bokeh palette to use, must be a member of
        bokeh.palettes.all_palettes.
    label_size: int (optional)
        Maximum size of output labels in plot, defaults to 60
    **kwargs
        Other keyword arguments, passed to bokeh.figure. These will override
        all defaults set by visualize.

    Returns
    -------
    The completed bokeh plot object.
    """
    bp = import_required("bokeh.plotting", _BOKEH_MISSING_MSG)
    from bokeh.models import HoverTool

    defaults = dict(
        title="Profile Results",
        tools="hover,save,reset,wheel_zoom,xpan",
        toolbar_location="above",
        width=800,
        height=300,
    )
    # Support plot_width and plot_height for backwards compatibility
    if "plot_width" in kwargs:
        kwargs["width"] = kwargs.pop("plot_width")
    if "plot_height" in kwargs:
        kwargs["height"] = kwargs.pop("plot_height")
    defaults.update((k, v) for (k, v) in kwargs.items() if k in _get_figure_keywords())

    if results:
        starts, ends = list(zip(*results))[3:]
        tics = sorted(unique(starts + ends))
        groups = groupby(lambda d: pprint_task(d[1], dsk, label_size), results)
        data = {}
        for k, vals in groups.items():
            cnts = dict.fromkeys(tics, 0)
            for v in vals:
                cnts[v.cache_time] += v.metric
                cnts[v.free_time] -= v.metric
            data[k] = [0] + list(accumulate(add, pluck(1, sorted(cnts.items()))))

        tics = [0] + [i - start_time for i in tics]
        p = bp.figure(x_range=[0, max(tics)], **defaults)

        for (key, val), color in zip(data.items(), get_colors(palette, data.keys())):
            p.line(
                "x",
                "y",
                line_color=color,
                line_width=3,
                source=bp.ColumnDataSource(
                    {"x": tics, "y": val, "label": [key for i in val]}
                ),
            )
    else:
        p = bp.figure(y_range=[0, 10], x_range=[0, 10], **defaults)
    p.yaxis.axis_label = "Cache Size ({0})".format(metric_name)
    p.xaxis.axis_label = "Time (s)"

    hover = p.select(HoverTool)
    hover.tooltips = """
    <div>
        <span style="font-size: 14px; font-weight: bold;">Task:</span>
        <span style="font-size: 10px; font-family: Monaco, monospace;">@label</span>
    </div>
    """
    return p
def plot_tasks(results, dsk, palette="Viridis", label_size=60, **kwargs):
    """Visualize the results of profiling in a bokeh plot.

    Parameters
    ----------
    results : sequence
        Output of Profiler.results
    dsk : dict
        The dask graph being profiled.
    palette : string, optional
        Name of the bokeh palette to use, must be a member of
        bokeh.palettes.all_palettes.
    label_size: int (optional)
        Maximum size of output labels in plot, defaults to 60
    **kwargs
        Other keyword arguments, passed to bokeh.figure. These will override
        all defaults set by visualize.

    Returns
    -------
    The completed bokeh plot object.
    """
    bp = import_required("bokeh.plotting", _BOKEH_MISSING_MSG)
    from bokeh.models import HoverTool

    defaults = dict(
        title="Profile Results",
        tools="hover,save,reset,xwheel_zoom,xpan",
        toolbar_location="above",
        width=800,
        height=300,
    )
    # Support plot_width and plot_height for backwards compatibility
    if "plot_width" in kwargs:
        kwargs["width"] = kwargs.pop("plot_width")
    if "plot_height" in kwargs:
        kwargs["height"] = kwargs.pop("plot_height")
    defaults.update((k, v) for (k, v) in kwargs.items() if k in _get_figure_keywords())

    if results:
        keys, tasks, starts, ends, ids = zip(*results)

        id_group = groupby(itemgetter(4), results)
        timings = dict(
            (k, [i.end_time - i.start_time for i in v]) for (k, v) in id_group.items()
        )
        id_lk = dict(
            (t[0], n)
            for (n, t) in enumerate(
                sorted(timings.items(), key=itemgetter(1), reverse=True)
            )
        )

        left = min(starts)
        right = max(ends)

        p = bp.figure(
            y_range=[str(i) for i in range(len(id_lk))],
            x_range=[0, right - left],
            **defaults,
        )

        data = {}
        data["width"] = width = [e - s for (s, e) in zip(starts, ends)]
        data["x"] = [w / 2 + s - left for (w, s) in zip(width, starts)]
        data["y"] = [id_lk[i] + 1 for i in ids]
        data["function"] = funcs = [pprint_task(i, dsk, label_size) for i in tasks]
        data["color"] = get_colors(palette, funcs)
        data["key"] = [str(i) for i in keys]

        source = bp.ColumnDataSource(data=data)

        p.rect(
            source=source,
            x="x",
            y="y",
            height=1,
            width="width",
            color="color",
            line_color="gray",
        )
    else:
        p = bp.figure(y_range=[str(i) for i in range(8)], x_range=[0, 10], **defaults)
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.yaxis.axis_label = "Worker ID"
    p.xaxis.axis_label = "Time (s)"

    hover = p.select(HoverTool)
    hover.tooltips = """
    <div>
        <span style="font-size: 14px; font-weight: bold;">Key:</span>
        <span style="font-size: 10px; font-family: Monaco, monospace;">@key</span>
    </div>
    <div>
        <span style="font-size: 14px; font-weight: bold;">Task:</span>
        <span style="font-size: 10px; font-family: Monaco, monospace;">@function</span>
    </div>
    """
    hover.point_policy = "follow_mouse"

    return p
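# Hedged usage sketch (assumption, not from the source): Profiler.results and
# the collection's task graph are the usual inputs here; in practice one would
# normally call prof.visualize(), which wraps this plotting helper.
import dask.array as da
from dask.diagnostics import Profiler
from dask.diagnostics.profile_visualize import plot_tasks

x = da.random.random((1000, 1000), chunks=(100, 100))
y = x.dot(x.T).sum()

with Profiler() as prof:
    y.compute()

p = plot_tasks(prof.results, dict(y.__dask_graph__()))
# `p` is a bokeh figure; display it with bokeh.plotting.show(p).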
def test_groupby_tasks_2(size, npartitions, groups):
    func = lambda x: x % groups
    b = db.range(size, npartitions=npartitions).groupby(func, shuffle="tasks")
    result = b.compute(scheduler="sync")
    assert dict(result) == groupby(func, range(size))