def getOrderOfExecution(dsk, finalTask, numConcurrentTasks=1):
    '''Get the optimal order to minimize backtracking and memory footprint.

    Runs the dask graph through the threaded scheduler while a diagnostic
    callback prints each task as it completes, so the printed sequence is
    the order of execution.

    Parameters
    ----------
    dsk : dict
        Dask task graph (key -> task tuple).
    finalTask : hashable
        The final task you want to achieve (one of the keys in the graph).
    numConcurrentTasks : int, optional
        Can be more than 1 if you think you can do two things at once.
    '''
    from dask.callbacks import Callback
    from dask.threaded import get
    # BUG FIX: time.clock() was removed in Python 3.8; perf_counter is the
    # recommended monotonic replacement for interval timing.
    from time import perf_counter as clock

    class PrintKeys(Callback):
        def __init__(self, numWorkers):
            self.equivTime = None
            self.numWorkers = numWorkers

        def _start(self, dsk):
            print("Working with {} concurrent tasks".format(self.numWorkers))
            self.startTime = clock()

        def _pretask(self, key, dask, state):
            """Print the key of every task as it's started"""
            pass

        def _posttask(self, key, result, dsk, state, id):
            # Show what was computed and from which inputs.
            print("Do {} <- {}, approx. {} time units".format(
                repr(key), dsk[key][1:], repr(result)))

        def _finish(self, dsk, state, errored):
            self.endTime = clock()
            # Scale elapsed seconds into arbitrary "time units" (x100).
            dt = (self.endTime - self.startTime) * 100.
            print("Approximate time to complete: {} time units".format(dt))
            print("Equivalent single thread time: {} time units".format(
                dt * self.numWorkers))
            self.equivTime = dt * self.numWorkers

    with PrintKeys(numConcurrentTasks):
        get(dsk, finalTask, num_workers=numConcurrentTasks)
def test_resource_profiler_plot():
    """ResourceProfiler.visualize honours figure options and sparse results."""
    with ResourceProfiler(dt=0.01) as rprof:
        get(dsk2, 'c')
    plot = rprof.visualize(plot_width=500, plot_height=300, tools="hover",
                           title="Not the default", show=False, save=False)
    assert plot.plot_width == 500
    assert plot.plot_height == 300
    assert len(plot.tools) == 1
    assert isinstance(plot.tools[0], bokeh.models.HoverTool)
    assert check_title(plot, "Not the default")
    # Rendering with no data and with one sample must not raise or warn,
    # and the axis bounds must stay sane.
    rprof.clear()
    for fake_results in ([], [(1.0, 0, 0)]):
        rprof.results = fake_results
        with pytest.warns(None) as record:
            plot = rprof.visualize(show=False, save=False)
        assert len(record) == 0
        assert plot.x_range.start == 0
        assert plot.x_range.end == 1
        assert plot.y_range.start == 0
        assert plot.y_range.end == 100
        assert plot.extra_y_ranges['memory'].start == 0
        assert plot.extra_y_ranges['memory'].end == 100
def getOptimalNumThreads(dsk, finalTask):
    '''Empirically find a good thread count for executing the graph.

    Runs ``dsk`` with 1..9 worker threads, timing each run with a dask
    Callback, and stops once the marginal efficiency gain drops below 10%.

    Parameters
    ----------
    dsk : dict
        Dask task graph.
    finalTask : hashable
        Key in ``dsk`` to compute.

    Returns
    -------
    int
        Suggested number of concurrent tasks.
    '''
    threads = np.arange(9) + 1  # physical limit?
    from dask.callbacks import Callback
    from dask.threaded import get
    # BUG FIX: time.clock() was removed in Python 3.8; use perf_counter.
    from time import perf_counter as clock

    class PrintKeys(Callback):
        def __init__(self, numWorkers):
            self.equivTime = None
            self.numWorkers = numWorkers

        def _start(self, dsk):
            self.startTime = clock()

        def _finish(self, dsk, state, errored):
            self.endTime = clock()
            dt = (self.endTime - self.startTime) * 100.
            # Equivalent single-thread cost: wall time x worker count.
            # (The old `equivTime = self.equivTime` here only bound a dead
            # local; it never updated the enclosing scope, so it was removed.)
            self.equivTime = dt * self.numWorkers

    eqT = []
    eff = []
    optnum = 1
    for i in threads:
        with PrintKeys(i) as pk:
            get(dsk, finalTask, num_workers=i)
        eqT.append(pk.equivTime)
        # Compute the efficiency once and reuse it (was computed twice).
        ef = 1 - pk.equivTime / eqT[0] / i
        print("Efficiency [{}]: {}".format(i, ef))
        eff.append(ef)
        # Stop once the relative gain over the previous run is under 10%.
        # BUG FIX: eff[0] is always exactly 0 by construction, so the old
        # unguarded `ef / eff[-2]` raised ZeroDivisionError on iteration 2.
        if len(eff) > 1 and eff[-2] != 0 and ef / eff[-2] < 1.1:
            optnum = i - 1
            break
    print("Optimal number of concurrent tasks (if possible): {}".format(optnum))
    return optnum
def test_blockiter_bothmasks(signal, flat, dtype, nm, sm):
    """Block iterator applies navigation/signal masks in flat and nd modes."""
    real_first = get(signal.data.dask,
                     (signal.data.name, 0, 0, 0, 0)).copy()
    real_second = get(signal.data.dask,
                      (signal.data.name, 0, 1, 0, 0)).copy()
    # Don't want to rechunk, so change dtype manually
    signal.data = signal.data.astype(dtype)
    it = signal._block_iterator(flat_signal=flat,
                                navigation_mask=nm,
                                signal_mask=sm,
                                get=get)
    first_block = next(it)
    second_block = next(it)
    if nm is not None:
        nm = nm[:2, :4]
    real_first = real_first.astype(dtype)
    real_second = real_second.astype(dtype)
    if flat:
        if nm is not None:
            nm = ~nm
            navslice = np.where(nm.flat)[0]
        else:
            navslice = slice(None)
        sigslice = slice(11, None) if sm is not None else slice(None)
        slices1 = (navslice, sigslice)
        real_first = real_first.reshape((2 * 4, -1))[slices1]
        real_second = real_second.reshape((2 * 5, -1))[:, sigslice]
    else:
        # BUG FIX: `dtype is 'float'` compared identity with a string
        # literal (interning-dependent, SyntaxWarning on modern Python);
        # value equality is what was meant.
        value = np.nan if dtype == 'float' else 0
        if nm is not None:
            real_first[nm, ...] = value
        if sm is not None:
            real_first[..., sm] = value
            real_second[..., sm] = value
    np.testing.assert_allclose(first_block, real_first)
    np.testing.assert_allclose(second_block, real_second)
def test_interrupt():
    """A KeyboardInterrupt delivered mid-run must abort the scheduler quickly."""
    # Python 2 and Windows implement `queue.get` with polling, so setting an
    # async exception suffices; Python 3 elsewhere needs a real SIGINT sent
    # to the main thread.
    if PY2:
        from thread import interrupt_main
    elif os.name == 'nt':
        from _thread import interrupt_main
    else:
        main_thread = threading.get_ident()

        def interrupt_main():
            signal.pthread_kill(main_thread, signal.SIGINT)

    def slow():
        sleep(5)

    graph = {('x', i): (slow,) for i in range(20)}
    graph['x'] = (len, list(graph.keys()))
    try:
        timer = threading.Timer(0.5, interrupt_main)
        timer.start()
        t0 = time()
        get(graph, 'x')
    except KeyboardInterrupt:
        pass
    except Exception:
        assert False, "Failed to interrupt"
    t1 = time()
    if t1 - t0 > 4:
        assert False, "Failed to interrupt"
def test_no_tasks(capsys):
    """A graph containing only a literal still renders a completed bar."""
    with ProgressBar():
        get({'x': 1}, 'x')
    captured, _ = capsys.readouterr()
    final_line = captured.split('\r')[-1]
    bar, percent, elapsed = (field.strip() for field in final_line.split('|'))
    assert bar == "[########################################]"
    assert percent == "100% Completed"
def test_progressbar(capsys):
    """Default-width and custom-width bars both run to completion."""
    with ProgressBar():
        result = get(dsk, 'e')
    assert result == 6
    check_bar_completed(capsys)
    with ProgressBar(width=20):
        get(dsk, 'e')
    check_bar_completed(capsys, 20)
def test_clean_exit():
    """ProgressBar must stop its timer thread even when the graph errors."""
    dsk = {"a": (lambda: 1 / 0,)}
    try:
        with ProgressBar() as pbar:
            get(dsk, "a")
    # BUG FIX: was a bare `except:`, which also swallows KeyboardInterrupt /
    # SystemExit and any unrelated failure; catch only the expected error.
    except ZeroDivisionError:
        pass
    assert not pbar._running
    assert not pbar._timer.is_alive()
def test_clean_exit():
    """The progress bar shuts down its background timer after a failure."""
    failing = {'a': (lambda: 1 / 0, )}
    try:
        with ProgressBar() as pbar:
            get(failing, 'a')
    except ZeroDivisionError:
        pass
    assert not pbar._running
    assert not pbar._timer.is_alive()
def test_saves_file():
    """visualize(file_path=...) writes an HTML document to disk."""
    with tmpfile("html") as path:
        with prof:
            get(dsk, "e")
        # Only checking that rendering to disk does not error.
        prof.visualize(show=False, file_path=path)
        assert os.path.exists(path)
        with open(path) as f:
            contents = f.read()
        assert "html" in contents.lower()
def test_with_alias(capsys):
    """An aliased key ('e' -> 'd') still drives the bar to completion."""
    graph = {
        'a': 1,
        'b': 2,
        'c': (add, 'a', 'b'),
        'd': (add, 1, 2),
        'e': 'd',
        'f': (mul, 'e', 'c'),
    }
    with ProgressBar():
        get(graph, 'f')
    check_bar_completed(capsys)
def test_profiler_works_under_error():
    """Tasks completed before the failure are still recorded."""
    def div(x, y):
        return x / y

    graph = {'x': (div, 1, 1), 'y': (div, 'x', 2), 'z': (div, 'y', 0)}
    with ignoring(ZeroDivisionError):
        with prof:
            get(graph, 'z')
    assert all(len(entry) == 5 for entry in prof.results)
    assert len(prof.results) == 2
def test_saves_file():
    """The profile visualization can be saved as an HTML file."""
    with tmpfile('html') as path:
        with prof:
            get(dsk, 'e')
        # Only checking that rendering to disk does not error.
        prof.visualize(show=False, file_path=path)
        assert os.path.exists(path)
        with open(path) as f:
            assert 'HTML' in f.read()
def test_dont_spawn_too_many_threads():
    """Repeated scheduler runs with num_workers=4 must not leak threads."""
    baseline = threading.active_count()
    graph = {('x', i): (lambda: i,) for i in range(10)}
    graph['x'] = (sum, list(graph))
    for _ in range(20):
        get(graph, 'x', num_workers=4)
    assert threading.active_count() <= baseline + 8
def test_register(profiler):
    """register()/unregister() toggles global result collection."""
    prof = profiler()
    try:
        prof.register()
        get(dsk2, 'c')
        first_count = len(prof.results)
        assert first_count > 0
        get(dsk2, 'c')
        assert len(prof.results) > first_count
    finally:
        prof.unregister()
def test_resource_profiler_plot():
    """Figure options are forwarded to the bokeh plot."""
    with ResourceProfiler(dt=0.01) as rprof:
        get(dsk2, "c")
    fig = rprof.visualize(plot_width=500, plot_height=300, tools="hover",
                          title="Not the default", show=False, save=False)
    assert fig.plot_width == 500
    assert fig.plot_height == 300
    assert len(fig.tools) == 1
    assert isinstance(fig.tools[0], bokeh.models.HoverTool)
    assert fig.title == "Not the default"
    # An empty profiler must still render without raising.
    rprof.clear()
    rprof.visualize(show=False, save=False)
def test_profiler_plot():
    """visualize() forwards width, height, tools, and title."""
    with prof:
        get(dsk, 'e')
    fig = prof.visualize(plot_width=500, plot_height=300, tools="hover",
                         title="Not the default", show=False, save=False)
    assert fig.plot_width == 500
    assert fig.plot_height == 300
    assert len(fig.tools) == 1
    assert isinstance(fig.tools[0], bokeh.models.HoverTool)
    assert fig.title == "Not the default"
def test_cache_profiler_plot():
    """The custom metric name appears in the cache-size axis label."""
    with CacheProfiler(metric_name="non-standard") as cprof:
        get(dsk, "e")
    fig = cprof.visualize(plot_width=500, plot_height=300, tools="hover",
                          title="Not the default", show=False, save=False)
    assert fig.plot_width == 500
    assert fig.plot_height == 300
    assert len(fig.tools) == 1
    assert isinstance(fig.tools[0], bokeh.models.HoverTool)
    assert fig.title == "Not the default"
    assert fig.axis[1].axis_label == "Cache Size (non-standard)"
    # An empty profiler must still render without raising.
    cprof.clear()
    cprof.visualize(show=False, save=False)
def test_progressbar(capsys):
    """Bars of default and custom width both finish at 100%."""
    def completed_fields():
        # Parse the last carriage-return-separated bar line.
        captured, _ = capsys.readouterr()
        return [part.strip() for part in captured.split('\r')[-1].split('|')]

    with ProgressBar():
        assert get(dsk, 'e') == 6
    bar, percent, _ = completed_fields()
    assert bar == "[########################################]"
    assert percent == "100% Completed"
    with ProgressBar(width=20):
        get(dsk, 'e')
    bar, percent, _ = completed_fields()
    assert bar == "[####################]"
    assert percent == "100% Completed"
def test_plot_both():
    """Profiler and ResourceProfiler render together into one grid."""
    from dask.diagnostics.profile_visualize import visualize
    from bokeh.plotting import GridPlot
    with ResourceProfiler(dt=0.01) as rprof:
        with prof:
            get(dsk2, 'c')
    grid = visualize([prof, rprof], label_size=50, title="Not the default",
                     show=False, save=False)
    assert isinstance(grid, GridPlot)
    assert len(grid.children) == 2
    top, bottom = grid.children[0][0], grid.children[1][0]
    assert top.title == "Not the default"
    assert top.xaxis[0].axis_label is None
    assert bottom.title is None
    assert bottom.xaxis[0].axis_label == 'Time (s)'
def test_broken_callback():
    """An exception raised inside one callback propagates to the caller."""
    from dask.callbacks import Callback

    def noop(*args, **kwargs):
        pass

    def explode(*args, **kwargs):
        raise ValueError('my_exception')

    graph = {'x': 1}
    with Callback(start=explode, finish=noop):
        with Callback(start=noop, finish=noop):
            with pytest.raises(ValueError, match='my_exception'):
                get(graph, 'x')
def test_resource_profiler():
    """ResourceProfiler collects (time, mem, cpu) triples; close() is final."""
    with ResourceProfiler(dt=0.01) as rprof:
        get(dsk2, 'c')
    results = rprof.results
    # BUG FIX: all() over an empty list is vacuously true, so the old test
    # could pass having collected nothing; require at least one sample
    # (matching the sibling version of this test).
    assert len(results) > 0
    assert all(isinstance(i, tuple) and len(i) == 3 for i in results)
    rprof.clear()
    assert rprof.results == []
    rprof.close()
    assert not rprof._tracker.is_alive()
    # A closed profiler must refuse to start again.
    with pytest.raises(AssertionError):
        with rprof:
            get(dsk, 'e')
def test_register(capsys):
    """ProgressBar.register installs a global callback; unregister removes it."""
    try:
        bar = ProgressBar()
        bar.register()
        assert _globals['callbacks']
        get(dsk, 'e')
        check_bar_completed(capsys)
        bar.unregister()
        assert not _globals['callbacks']
    finally:
        # Never leave stray callbacks behind for other tests.
        _globals['callbacks'].clear()
def test_resource_profiler():
    """Samples are 3-tuples; close() stops the tracker for good."""
    with ResourceProfiler(dt=0.01) as rprof:
        get(dsk2, 'c')
    samples = rprof.results
    assert len(samples) > 0
    for sample in samples:
        assert isinstance(sample, tuple)
        assert len(sample) == 3
    rprof.clear()
    assert rprof.results == []
    rprof.close()
    assert not rprof._tracker.is_alive()
    # A closed profiler must refuse to start again.
    with pytest.raises(AssertionError):
        with rprof:
            get(dsk, 'e')
def test_with_cache(capsys):
    """Cache and ProgressBar callbacks cooperate on the same run."""
    cachey = pytest.importorskip('cachey')
    from dask.cache import Cache
    backend = cachey.Cache(10000)
    cache_cb = Cache(backend)
    with cache_cb, ProgressBar():
        assert get({'x': (mul, 1, 2)}, 'x') == 2
    check_bar_completed(capsys)
    assert backend.data['x'] == 2
    with cache_cb, ProgressBar():
        assert get({'x': (mul, 1, 2), 'y': (mul, 'x', 3)}, 'y') == 6
    check_bar_completed(capsys)
def test_profiler_plot():
    """Profiler.visualize forwards figure options and survives empty results."""
    with prof:
        get(dsk, 'e')
    fig = prof.visualize(plot_width=500, plot_height=300, tools="hover",
                         title="Not the default", show=False, save=False)
    assert fig.plot_width == 500
    assert fig.plot_height == 300
    assert len(fig.tools) == 1
    assert isinstance(fig.tools[0], bokeh.models.HoverTool)
    assert check_title(fig, "Not the default")
    # An empty profiler must still render without raising.
    prof.clear()
    prof.visualize(show=False, save=False)
def test_resource_profiler_plot():
    """Resource plot honours figure options; empty results don't raise."""
    with ResourceProfiler(dt=0.01) as rprof:
        get(dsk2, 'c')
    fig = rprof.visualize(plot_width=500, plot_height=300, tools="hover",
                          title="Not the default", show=False, save=False)
    assert fig.plot_width == 500
    assert fig.plot_height == 300
    assert len(fig.tools) == 1
    assert isinstance(fig.tools[0], bokeh.models.HoverTool)
    assert fig.title == "Not the default"
    # An empty profiler must still render without raising.
    rprof.clear()
    rprof.visualize(show=False, save=False)
def test_cache_profiler_plot():
    """Cache plot carries the custom metric name into its axis label."""
    with CacheProfiler(metric_name='non-standard') as cprof:
        get(dsk, 'e')
    fig = cprof.visualize(plot_width=500, plot_height=300, tools="hover",
                          title="Not the default", show=False, save=False)
    assert fig.plot_width == 500
    assert fig.plot_height == 300
    assert len(fig.tools) == 1
    assert isinstance(fig.tools[0], bokeh.models.HoverTool)
    assert check_title(fig, "Not the default")
    assert fig.axis[1].axis_label == 'Cache Size (non-standard)'
    # An empty profiler must still render without raising.
    cprof.clear()
    cprof.visualize(show=False, save=False)
def test_two_gets():
    """Task records accumulate across runs inside one profiled context."""
    with prof:
        get(dsk, "e")
    count_e = len(prof.results)
    other = {"x": (add, 1, 2), "y": (add, "x", "x")}
    with prof:
        get(other, "y")
    count_y = len(prof.results)
    with prof:
        get(dsk, "e")
        get(other, "y")
        get(dsk, "e")
    assert len(prof.results) == count_e + count_y + count_e
def test_two_gets():
    """Each profiled run appends its task records; counts are additive."""
    with prof:
        get(dsk, 'e')
    n_first = len(prof.results)
    other = {'x': (add, 1, 2), 'y': (add, 'x', 'x')}
    with prof:
        get(other, 'y')
    n_second = len(prof.results)
    with prof:
        get(dsk, 'e')
        get(other, 'y')
        get(dsk, 'e')
    assert len(prof.results) == n_first + n_second + n_first
def test_callback():
    """Threaded get invokes start/end callbacks with expected arguments."""
    from dask.threaded import get

    def bump(x):
        return x + 1

    graph = {'a': (bump, 1)}

    def on_start(key, d, state):
        assert key == 'a' or key is None
        assert d == graph
        assert isinstance(state, dict)

    def on_end(key, value, d, state, worker_id):
        assert key == 'a' or key is None
        assert value == 2 or value is None
        assert d == graph
        assert isinstance(state, dict)

    get(graph, 'a', start_callback=on_start, end_callback=on_end)
def test_callback():
    """start/end callbacks receive key, graph, and scheduler state."""
    from dask.threaded import get

    def bump(x):
        return x + 1

    graph = {"a": (bump, 1)}

    def on_start(key, d, state):
        assert key == "a" or key is None
        assert d == graph
        assert isinstance(state, dict)

    def on_end(key, value, d, state, worker_id):
        assert key == "a" or key is None
        assert value == 2 or value is None
        assert d == graph
        assert isinstance(state, dict)

    get(graph, "a", start_callback=on_start, end_callback=on_end)
def test_with_cache(capsys):
    """Cache callback stores results while ProgressBar reports progress."""
    cachey = pytest.importorskip("cachey")
    from dask.cache import Cache
    store = cachey.Cache(10000)
    cache_cb = Cache(store)
    with cache_cb:
        with ProgressBar():
            assert get({"x": (mul, 1, 2)}, "x") == 2
    check_bar_completed(capsys)
    assert store.data["x"] == 2
    with cache_cb:
        with ProgressBar():
            assert get({"x": (mul, 1, 2), "y": (mul, "x", 3)}, "y") == 6
    check_bar_completed(capsys)
def test_plot_multiple():
    """Both profilers render into a two-row grid; empty plots don't raise."""
    from dask.diagnostics.profile_visualize import visualize
    from bokeh.plotting import GridPlot
    with ResourceProfiler(dt=0.01) as rprof:
        with prof:
            get(dsk2, 'c')
    grid = visualize([prof, rprof], label_size=50, title="Not the default",
                     show=False, save=False)
    assert isinstance(grid, GridPlot)
    assert len(grid.children) == 2
    upper, lower = grid.children[0][0], grid.children[1][0]
    assert upper.title == "Not the default"
    assert upper.xaxis[0].axis_label is None
    assert lower.title is None
    assert lower.xaxis[0].axis_label == 'Time (s)'
    # Rendering with no data must not raise.
    prof.clear()
    rprof.clear()
    visualize([prof, rprof], show=False, save=False)
def test_plot_multiple():
    """Combined visualize() stacks both profiler figures in a GridPlot."""
    from dask.diagnostics.profile_visualize import visualize
    from bokeh.models import GridPlot
    with ResourceProfiler(dt=0.01) as rprof:
        with prof:
            get(dsk2, 'c')
    grid = visualize([prof, rprof], label_size=50, title="Not the default",
                     show=False, save=False)
    assert isinstance(grid, GridPlot)
    assert len(grid.children) == 2
    upper, lower = grid.children[0][0], grid.children[1][0]
    assert upper.title == "Not the default"
    assert upper.xaxis[0].axis_label is None
    assert lower.title is None
    assert lower.xaxis[0].axis_label == 'Time (s)'
    # Rendering with no data must not raise.
    prof.clear()
    rprof.clear()
    visualize([prof, rprof], show=False, save=False)
def main():
    """Build a tiny three-node dependency graph and run it."""
    graph = {
        'task_one': (task_one, ),
        'task_two': (task_two, ),
        'task_three': (task_three, 'task_one', 'task_two'),
    }
    result = get(graph, 'task_three')  # executes in parallel
    print(result)
def test_pool_kwarg():
    """A user-supplied pool bounds the number of distinct worker threads."""
    def worker():
        sleep(0.01)
        return threading.get_ident()

    graph = {('x', i): (worker,) for i in range(30)}
    # 'x' counts the distinct thread idents seen across all 30 tasks.
    graph['x'] = (len, (set, [('x', i) for i in range(len(graph))]))
    with ThreadPool(3) as pool:
        assert get(graph, 'x', pool=pool) == 3
def test_plot_multiple():
    """Combined visualize() stacks both figures; empty plots don't raise."""
    from dask.diagnostics.profile_visualize import visualize
    with ResourceProfiler(dt=0.01) as rprof:
        with prof:
            get(dsk2, "c")
    layout = visualize(
        [prof, rprof], label_size=50, title="Not the default", show=False, save=False
    )
    figures = [row[0] for row in layout.children[1].children]
    assert len(figures) == 2
    assert figures[0].title.text == "Not the default"
    assert figures[0].xaxis[0].axis_label is None
    assert figures[1].title is None
    assert figures[1].xaxis[0].axis_label == "Time (s)"
    # Rendering with no data must not raise.
    prof.clear()
    rprof.clear()
    visualize([prof, rprof], show=False, save=False)
def test_profiler_works_under_error():
    """Tasks completed before the failure are still recorded."""
    # Named function instead of a lambda bound to a name (PEP 8 E731).
    def div(x, y):
        return x / y

    dsk = {'x': (div, 1, 1), 'y': (div, 'x', 2), 'z': (div, 'y', 0)}
    with ignoring(ZeroDivisionError):
        with prof:
            # Return value dropped: the scheduler raises before producing one,
            # so the old unused `out =` binding was dead code.
            get(dsk, 'z')
    assert all(len(v) == 5 for v in prof.results)
    assert len(prof.results) == 2
def test_pool_kwarg(pool_typ):
    """Any pool type passed via `pool=` limits concurrency to its size."""
    def ident_after_nap():
        sleep(0.01)
        return threading.get_ident()

    graph = {("x", i): (ident_after_nap, ) for i in range(30)}
    # "x" counts distinct thread idents across all tasks.
    graph["x"] = (len, (set, [("x", i) for i in range(len(graph))]))
    with pool_typ(3) as pool:
        assert get(graph, "x", pool=pool) == 3
def test_profiler():
    """Profiler records key and task for every executed node."""
    with prof:
        assert get(dsk, "e") == 6
    records = sorted(prof.results, key=lambda d: d.key)
    assert [r.key for r in records] == ["c", "d", "e"]
    assert [r.task for r in records] == [(add, "a", "b"), (mul, "a", "b"),
                                         (mul, "c", "d")]
    prof.clear()
    assert prof.results == []
def test_plot_multiple():
    """Grid layout differs across bokeh versions; both shapes are handled."""
    from dask.diagnostics.profile_visualize import visualize
    with ResourceProfiler(dt=0.01) as rprof:
        with prof:
            get(dsk2, 'c')
    layout = visualize([prof, rprof], label_size=50, title="Not the default",
                       show=False, save=False)
    if LooseVersion(bokeh.__version__) >= '0.12.0':
        figures = [row.children[0] for row in layout.children[1].children]
    else:
        figures = [row[0] for row in layout.children]
    assert len(figures) == 2
    assert check_title(figures[0], "Not the default")
    assert figures[0].xaxis[0].axis_label is None
    assert figures[1].title is None
    assert figures[1].xaxis[0].axis_label == 'Time (s)'
    # Rendering with no data must not raise.
    prof.clear()
    rprof.clear()
    visualize([prof, rprof], show=False, save=False)
def test_profiler():
    """Each executed node yields one profiling record with key and task."""
    with prof:
        assert get(dsk, 'e') == 6
    records = sorted(prof.results, key=lambda d: d.key)
    assert [r.key for r in records] == ['c', 'd', 'e']
    assert [r.task for r in records] == [(add, 'a', 'b'), (mul, 'a', 'b'),
                                         (mul, 'c', 'd')]
    prof.clear()
    assert prof.results == []
def test_resource_profiler():
    """Tracker stops on exit, close() is idempotent, and reuse restarts it."""
    with ResourceProfiler(dt=0.01) as rprof:
        get(dsk2, "c")
    samples = rprof.results
    assert len(samples) > 0
    assert all(isinstance(s, tuple) and len(s) == 3 for s in samples)
    # Tracker stopped on exit
    assert not rprof._is_running()
    rprof.clear()
    assert rprof.results == []
    # Close is idempotent
    rprof.close()
    assert not rprof._is_running()
    # Restarts tracker if already closed
    with rprof:
        get(dsk2, "c")
    assert len(rprof.results) > 0
def testing_city_finder():
    """End-to-end check of the city-finder workflow against a stored report."""
    path = os.path.join(os.path.dirname(__file__), 'sample_transcripts',
                        'out-example-2021-02-01-hansard-plenary.txt')
    city_finder = processor()
    workflow = city_finder.build_workflow(path)
    report = get(workflow, 'output')
    with open('./tests/test_report.json', 'r') as json_file:
        expected_report = json.load(json_file)
    compiled_report = report.compile()
    # BUG FIX: the expected report was loaded but never compared, so this
    # test could not fail; assert the compiled output matches the fixture.
    assert compiled_report == expected_report
def test_cache_profiler():
    """CacheProfiler records 5-tuples and supports a custom metric."""
    with CacheProfiler() as cprof:
        # Return value unused; the old `out =` bindings were dead code.
        get(dsk2, 'c')
    results = cprof.results
    assert all(isinstance(i, tuple) and len(i) == 5 for i in results)
    cprof.clear()
    assert cprof.results == []
    tics = [0]

    def nbytes(res):
        # Monotonic counter stands in for a real size metric.
        tics[0] += 1
        return tics[0]

    with CacheProfiler(nbytes) as cprof:
        get(dsk2, 'c')
    results = cprof.results
    assert tics[-1] == len(results)
    assert tics[-1] == results[-1].metric
    # Metric name defaults to the metric function's name.
    assert cprof._metric_name == 'nbytes'
    assert CacheProfiler(metric=nbytes, metric_name='foo')._metric_name == 'foo'
def getOptimalNumThreads(dsk, finalTask):
    '''Empirically find a good thread count for executing the graph.

    Runs ``dsk`` with 1..9 worker threads, timing each run with a dask
    Callback, and stops once the marginal efficiency gain drops below 10%.

    Parameters
    ----------
    dsk : dict
        Dask task graph.
    finalTask : hashable
        Key in ``dsk`` to compute.

    Returns
    -------
    int
        Suggested number of concurrent tasks.
    '''
    threads = np.arange(9) + 1  # physical limit?
    from dask.callbacks import Callback
    from dask.threaded import get
    # BUG FIX: time.clock() was removed in Python 3.8; use perf_counter.
    from time import perf_counter as clock

    class PrintKeys(Callback):
        def __init__(self, numWorkers):
            self.equivTime = None
            self.numWorkers = numWorkers

        def _start(self, dsk):
            self.startTime = clock()

        def _finish(self, dsk, state, errored):
            self.endTime = clock()
            dt = (self.endTime - self.startTime) * 100.
            # Equivalent single-thread cost: wall time x worker count.
            # (The old `equivTime = self.equivTime` here only bound a dead
            # local; it never updated the enclosing scope, so it was removed.)
            self.equivTime = dt * self.numWorkers

    eqT = []
    eff = []
    optnum = 1
    for i in threads:
        with PrintKeys(i) as pk:
            get(dsk, finalTask, num_workers=i)
        eqT.append(pk.equivTime)
        # Compute the efficiency once and reuse it (was computed twice).
        ef = 1 - pk.equivTime / eqT[0] / i
        print("Efficiency [{}]: {}".format(i, ef))
        eff.append(ef)
        # Stop once the relative gain over the previous run is under 10%.
        # BUG FIX: eff[0] is always exactly 0 by construction, so the old
        # unguarded `ef / eff[-2]` raised ZeroDivisionError on iteration 2.
        if len(eff) > 1 and eff[-2] != 0 and ef / eff[-2] < 1.1:
            optnum = i - 1
            break
    print(
        "Optimal number of concurrent tasks (if possible): {}".format(optnum))
    return optnum
def testing_register_pass():
    # Validate the country/nationality warnings and informations produced by
    # the registers processor for a known sample CSV.
    path = os.path.join(os.path.dirname(__file__), 'data',
                        'registers-sample.csv')
    registers = processor()
    workflow = registers.get_workflow(path)
    report = get(workflow, 'output')
    results = report.compile()
    print(json.dumps(report.compile(), indent=True))
    assert len(results['tables'][0]['warnings']) == 5
    # Warning about the unrecognised state code 'FY'.
    # NOTE: `report` is rebound below; it no longer refers to the workflow
    # output after this point.
    report = [issue for issue in results['tables'][0]['warnings']
              if issue['message'].endswith('\'FY\'')].pop()
    assert report['code'] == 'country-mismatch'
    assert report['error-data']['mismatch'] == 'FY'
    assert len(report['context']) == 1
    assert report['context'][0]['properties'] == {
        'name': 'Venus MacNee',
        'nationality': 'Namibian',
        'sample_date': '2017-03-01',
        'state': 'FY',
        'organizations': 12,
        'country': 'Finland'
    }
    assert len(report['error-data']) == 1
    # Warning that suggests 'British' for the misspelling 'Britidh',
    # including the fuzzy-match confidence (86).
    report = [issue for issue in results['tables'][0]['warnings']
              if issue['message'].endswith('\'British\'?')].pop()
    assert report['code'] == 'country-mismatch'
    assert report['error-data'] == {
        'mismatch': 'Britidh',
        'guess': ('British', 86)
    }
    assert len(report['context']) == 1
    assert report['context'][0]['properties'] == {
        'country': 'Trinidad',
        'nationality': 'Britidh',
        'sample_date': '2016-03-23',
        'name': 'Cara Matisz',
        'state': 'TT',
        'organizations': 9
    }
    # The warning points at the exact offending cell.
    assert report['item']['entity'] == {
        'type': 'Cell',
        'definition': None,
        'location': {
            'row': 3,
            'column': 5
        }
    }
    assert len(results['tables'][0]['informations']) == 3
    # Informational entries are column-level notices.
    report = results['tables'][0]['informations'][0]
    assert report['code'] == 'country-checked'
    assert report['item']['entity']['type'] == 'Column'
def test_cache():
    """Cache stores results and skips recomputation of cached keys."""
    backend = cachey.Cache(10000)
    cache_cb = Cache(backend)
    with cache_cb:
        assert get({'x': (inc, 1)}, 'x') == 2
    assert flag == [1]
    assert backend.data['x'] == 2
    assert not cache_cb.starttimes
    assert not cache_cb.durations
    while flag:
        flag.pop()
    graph = {'x': (inc, 1), 'y': (inc, 2), 'z': (add, 'x', 'y')}
    with cache_cb:
        assert get(graph, 'z') == 5
    assert flag == [2]  # no x present
    assert not _globals['callbacks']
def test_cache():
    """Cached keys are served from the cache instead of being recomputed."""
    backend = cachey.Cache(10000)
    cache_cb = Cache(backend)
    with cache_cb:
        assert get({"x": (inc, 1)}, "x") == 2
    assert flag == [1]
    assert backend.data["x"] == 2
    assert not cache_cb.starttimes
    assert not cache_cb.durations
    while flag:
        flag.pop()
    graph = {"x": (inc, 1), "y": (inc, 2), "z": (add, "x", "y")}
    with cache_cb:
        assert get(graph, "z") == 5
    assert flag == [2]  # no x present
    assert not Callback.active
def test_profiler_plot():
    """Figure options pass through; an empty profiler plots warning-free."""
    with prof:
        get(dsk, "e")
    fig = prof.visualize(
        plot_width=500,
        plot_height=300,
        tools="hover",
        title="Not the default",
        show=False,
        save=False,
    )
    assert fig.plot_width == 500
    assert fig.plot_height == 300
    assert len(fig.tools) == 1
    assert isinstance(fig.tools[0], bokeh.models.HoverTool)
    assert fig.title.text == "Not the default"
    # Rendering with no data must neither raise nor warn.
    prof.clear()
    with pytest.warns(None) as record:
        prof.visualize(show=False, save=False)
    assert len(record) == 0
def test_good_on_bad():
    """The processor reports every error in a deliberately bad CSV."""
    path = os.path.join(os.path.dirname(__file__), 'data', 'awful.csv')
    checker = processor()
    workflow = checker.get_workflow(path)
    results = get(workflow, 'output').compile()
    table_errors = results['tables'][0]['errors']
    print(table_errors)
    assert len(table_errors) == 7
    first = table_errors[0]
    print(first)
    assert first['code'] == 'duplicate-header'
def test_exceptions_propagate():
    """Custom exception attributes survive the threaded scheduler."""
    class MyException(Exception):
        def __init__(self, a, b):
            self.a = a
            self.b = b

        def __str__(self):
            return "My Exception!"

    def raiser():
        raise MyException(1, 2)

    from dask.threaded import get
    try:
        get({"x": (raiser, )}, "x")
    except MyException as e:
        assert "My Exception!" in str(e)
        assert "a" in dir(e)
        assert e.a == 1
        assert e.b == 2
    else:
        assert False
def test_cache_profiler_plot():
    """Custom metric name appears on the axis; empty plot is warning-free."""
    with CacheProfiler(metric_name="non-standard") as cprof:
        get(dsk, "e")
    fig = cprof.visualize(
        plot_width=500,
        plot_height=300,
        tools="hover",
        title="Not the default",
        show=False,
        save=False,
    )
    assert fig.plot_width == 500
    assert fig.plot_height == 300
    assert len(fig.tools) == 1
    assert isinstance(fig.tools[0], bokeh.models.HoverTool)
    assert fig.title.text == "Not the default"
    assert fig.axis[1].axis_label == "Cache Size (non-standard)"
    # Rendering with no data must neither raise nor warn.
    cprof.clear()
    with pytest.warns(None) as record:
        cprof.visualize(show=False, save=False)
    assert len(record) == 0
def getOrderOfExecution(dsk, finalTask, numConcurrentTasks=1):
    '''Get the optimal order to minimize backtracking and memory footprint.

    Runs the dask graph through the threaded scheduler while a diagnostic
    callback prints each task as it completes, so the printed sequence is
    the order of execution.

    Parameters
    ----------
    dsk : dict
        Dask task graph (key -> task tuple).
    finalTask : hashable
        The final task you want to achieve (one of the keys in the graph).
    numConcurrentTasks : int, optional
        Can be more than 1 if you think you can do two things at once.
    '''
    from dask.callbacks import Callback
    from dask.threaded import get
    # BUG FIX: time.clock() was removed in Python 3.8; perf_counter is the
    # recommended monotonic replacement for interval timing.
    from time import perf_counter as clock

    class PrintKeys(Callback):
        def __init__(self, numWorkers):
            self.equivTime = None
            self.numWorkers = numWorkers

        def _start(self, dsk):
            print("Working with {} concurrent tasks".format(self.numWorkers))
            self.startTime = clock()

        def _pretask(self, key, dask, state):
            """Print the key of every task as it's started"""
            pass

        def _posttask(self, key, result, dsk, state, id):
            # Show what was computed and from which inputs.
            print("Do {} <- {}, approx. {} time units".format(
                repr(key), dsk[key][1:], repr(result)))

        def _finish(self, dsk, state, errored):
            self.endTime = clock()
            # Scale elapsed seconds into arbitrary "time units" (x100).
            dt = (self.endTime - self.startTime) * 100.
            print("Approximate time to complete: {} time units".format(dt))
            print("Equivalent single thread time: {} time units".format(
                dt * self.numWorkers))
            self.equivTime = dt * self.numWorkers

    with PrintKeys(numConcurrentTasks):
        get(dsk, finalTask, num_workers=numConcurrentTasks)
def test_nonstandard_exceptions_propagate():
    """Remapped exceptions keep both the wrapped and original attributes."""
    class MyException(Exception):
        def __init__(self, a, b):
            self.a = a
            self.b = b

        def __str__(self):
            return "My Exception!"

    def raiser():
        raise MyException(1, 2)

    from dask.threaded import get
    try:
        get({'x': (raiser, )}, 'x')
    except MyException as e:
        assert "My Exception!" in str(e)
        assert "Traceback" in str(e)
        assert 'a' in dir(e)
        assert 'traceback' in dir(e)
        assert e.exception.a == 1 and e.exception.b == 2
        assert e.a == 1 and e.b == 2
    else:
        assert False