# Variant using the bokeh task-stream message template.
# Imports assumed from distributed's test suite; the gen_cluster harness
# (assumed here) supplies the client/scheduler/worker fixtures.
from copy import deepcopy

import pytest

from distributed import wait
from distributed.diagnostics.eventstream import EventStream
from distributed.utils_test import gen_cluster, div


@gen_cluster(client=True)
def test_eventstream(c, s, *workers):
    pytest.importorskip('bokeh')

    es = EventStream()
    s.add_plugin(es)
    assert es.buffer == []

    # Ten div tasks (one of which, 1 / 0, errs) plus one sum task.
    futures = c.map(div, [1] * 10, range(10))
    total = c.submit(sum, futures[1:])
    yield wait(total)
    yield wait(futures)

    assert len(es.buffer) == 11

    from distributed.bokeh import messages
    from distributed.diagnostics.progress_stream import task_stream_append

    lists = deepcopy(messages['task-events']['rectangles'])
    workers = dict()
    for msg in es.buffer:
        task_stream_append(lists, msg, workers)

    assert len([n for n in lists['name'] if n.startswith('transfer')]) == 2
    for name, color in zip(lists['name'], lists['color']):
        if name == 'transfer':
            assert color == 'red'

    assert any(c == 'black' for c in lists['color'])
def test_eventstream(c, s, *workers): pytest.importorskip("bokeh") es = EventStream() s.add_plugin(es) assert es.buffer == [] futures = c.map(div, [1] * 10, range(10)) total = c.submit(sum, futures[1:]) yield wait(total) yield wait(futures) assert len(es.buffer) == 11 from distributed.diagnostics.progress_stream import task_stream_append lists = { name: collections.deque(maxlen=100) for name in "start duration key name color worker worker_thread y alpha".split() } workers = dict() for msg in es.buffer: task_stream_append(lists, msg, workers) assert len([n for n in lists["name"] if n.startswith("transfer")]) == 2 for name, color in zip(lists["name"], lists["color"]): if name == "transfer": assert color == "red" assert any(c == "black" for c in lists["color"])
# Earliest variant, from the Executor era: `e` is the executor fixture and
# `_wait` is the coroutine form of wait. Assumed to run under the same
# gen_cluster-style harness as the variants above.
def test_eventstream(e, s, a, b):
    es = EventStream()
    s.add_plugin(es)
    assert es.buffer == []

    futures = e.map(div, [1] * 10, range(10))
    yield _wait(futures)

    # No reduction task here, so exactly one event per div call.
    assert len(es.buffer) == 10
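# The tests above depend on test-suite fixtures. Below is a minimal
# standalone sketch of the same plugin, assuming an in-process
# LocalCluster (processes=False) so the scheduler object is directly
# reachable; the function name is hypothetical and the printed count is
# illustrative.
from distributed import Client, LocalCluster, wait
from distributed.diagnostics.eventstream import EventStream


def inspect_eventstream():
    cluster = LocalCluster(processes=False)
    client = Client(cluster)

    es = EventStream()
    cluster.scheduler.add_plugin(es)  # record every task completion/error

    futures = client.map(lambda x: x ** 2, range(10))
    wait(futures)

    # Each buffered message describes one finished task (timings, worker, ...).
    print(len(es.buffer))

    client.close()
    cluster.close()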
# Benchmark comparing grid-search backends. The imports below are assumed;
# AsyncGridSearchCV, CachingPlugin, bf_to_ss, and aggregate_ss are
# project-local helpers whose module paths are not shown here.
from time import time
import multiprocessing

import pandas as pd

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.pipeline import Pipeline

import dask_searchcv as dcv
from distributed import Client, LocalCluster
from distributed.diagnostics.eventstream import EventStream


def bench_search(version, lparam, rstate, refit, occupancy):
    categories = [
        'alt.atheism',
        'talk.religion.misc',
    ]
    # Uncomment the following to do the analysis on all the categories
    # categories = None
    data = fetch_20newsgroups(subset='train', categories=categories,
                              data_home='/data/scikit_learn_data')
    test_data = fetch_20newsgroups(subset='test', categories=categories,
                                   data_home='/data/scikit_learn_data')

    parameters = [
        ('vect__max_df', (0.5, 0.75, 1.0)),
        ('vect__ngram_range', ((1, 1), (1, 2))),
        ('tfidf__use_idf', (True, False)),
        ('tfidf__norm', ('l1', 'l2')),
        ('clf__alpha', (1e-2, 1e-3, 1e-4, 1e-5)),
        ('clf__n_iter', (10, 50, 80)),
        ('clf__penalty', ('l2', 'elasticnet')),
    ]
    parameter_selection = [0, 3, 1, 4, 2, 5, 6]
    param_grid = dict([parameters[i] for i in parameter_selection[:lparam]])
    refit = {True: 1, False: 0}[refit]

    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(random_state=rstate)),
    ])

    if version > 0:
        # Optionally: LocalCluster(n_workers=4, threads_per_worker=2, ...)
        cluster = LocalCluster(diagnostics_port=None)
        client = Client(address=cluster.scheduler.address)
        event_stream = EventStream(scheduler=cluster.scheduler)
        if version == 3:
            client.run_on_scheduler(
                lambda dask_scheduler: dask_scheduler.add_plugin(
                    CachingPlugin(dask_scheduler)))

    if version == 0:
        search = GridSearchCV(pipeline, param_grid, n_jobs=-1, verbose=1,
                              refit=refit)
    elif version == 1:
        search = dcv.GridSearchCV(pipeline, param_grid, scheduler=client,
                                  refit=refit)
    else:  # version == 2 or version == 3
        search = AsyncGridSearchCV(pipeline, param_grid, threshold=1.1,
                                   client=client, refit=refit,
                                   occupancy_factor=occupancy)

    print("Fitting with {} parameters".format(
        len(ParameterGrid(search.param_grid))))
    start_t = time()
    if version <= 1:
        search.fit(data.data, data.target)
    else:
        search.fit_async(data.data, data.target)
    if refit:
        print("Fit results: {}".format(
            (search.score(data.data, data.target),
             search.score(test_data.data, test_data.target))))
    fit_duration = time() - start_t
    print("Fit took: {}".format(fit_duration))

    results = {
        'start_time': start_t,
        'fit_duration': fit_duration,
        'ncpu': multiprocessing.cpu_count(),
    }

    if version > 0:
        # Output information about the event stream: per-worker occupancy,
        # i.e. the fraction of each worker's wall-clock span spent on tasks.
        bf = pd.DataFrame(event_stream.buffer)
        occupancy_series = []
        for i in range(len(bf.worker.unique())):
            ss_async = bf_to_ss(bf[bf.worker == bf.worker.unique()[i]])
            agg_ss = aggregate_ss(ss_async)
            occupancy_series.append(
                (agg_ss.stop - agg_ss.start).sum() /
                (agg_ss.iloc[-1].stop - agg_ss.iloc[0].start))
        print("occupancy:")
        print(pd.Series(occupancy_series).describe())
        # TODO: report the transfer ratio as well.

        # Finally, shut everything down.
        client.shutdown()
        results['events'] = bf

    return results
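# Hypothetical driver for bench_search; the argument values below are
# illustrative, not taken from the original benchmark harness.
if __name__ == '__main__':
    # version: 0 = scikit-learn GridSearchCV, 1 = dask-searchcv GridSearchCV,
    #          2 = AsyncGridSearchCV, 3 = AsyncGridSearchCV + CachingPlugin
    # lparam:  how many of the seven candidate parameters to grid over
    results = bench_search(version=2, lparam=4, rstate=42,
                           refit=True, occupancy=1.0)
    print("fit took {:.1f}s on {} CPUs".format(
        results['fit_duration'], results['ncpu']))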