Example no. 1
# Assumed imports and harness for this excerpt: the test is written against
# distributed's gen_cluster fixture, which supplies the client `c`, the
# scheduler `s`, and the workers.
from copy import deepcopy

import pytest

from distributed import wait
from distributed.diagnostics.eventstream import EventStream
from distributed.utils_test import gen_cluster, div


@gen_cluster(client=True)
def test_eventstream(c, s, *workers):
    pytest.importorskip('bokeh')

    es = EventStream()
    s.add_plugin(es)
    assert es.buffer == []

    futures = c.map(div, [1] * 10, range(10))
    total = c.submit(sum, futures[1:])
    yield wait(total)
    yield wait(futures)

    assert len(es.buffer) == 11  # one message per task: 10 divs + 1 sum

    from distributed.bokeh import messages
    from distributed.diagnostics.progress_stream import task_stream_append
    lists = deepcopy(messages['task-events']['rectangles'])
    workers = dict()  # rebinds the parameter; task_stream_append tracks per-worker rows here
    for msg in es.buffer:
        task_stream_append(lists, msg, workers)

    assert len([n for n in lists['name'] if n.startswith('transfer')]) == 2
    for name, color in zip(lists['name'], lists['color']):
        if name == 'transfer':
            assert color == 'red'

    assert any(c == 'black' for c in lists['color'])
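
This test (and its newer variant in Example no. 2 below) hinges on EventStream acting as a scheduler plugin: every task transition the scheduler reports is appended to its buffer as a message dict. A minimal sketch of a plugin with the same shape, assuming distributed's SchedulerPlugin interface (the class and attribute names here are illustrative):

from distributed.diagnostics.plugin import SchedulerPlugin


class RecordingPlugin(SchedulerPlugin):
    """Record every task transition the scheduler reports."""

    def __init__(self):
        self.buffer = []

    def transition(self, key, start, finish, **kwargs):
        # kwargs carries the transition metadata (timings, worker, ...)
        self.buffer.append(dict(kwargs, key=key, start=start, finish=finish))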
Example no. 2
# Assumed imports for this newer variant of the same test; it builds the
# task-stream columns by hand as bounded deques instead of copying the
# bokeh message template used in Example no. 1.
import collections

import pytest

from distributed import wait
from distributed.diagnostics.eventstream import EventStream
from distributed.utils_test import gen_cluster, div


@gen_cluster(client=True)
def test_eventstream(c, s, *workers):
    pytest.importorskip("bokeh")

    es = EventStream()
    s.add_plugin(es)
    assert es.buffer == []

    futures = c.map(div, [1] * 10, range(10))
    total = c.submit(sum, futures[1:])
    yield wait(total)
    yield wait(futures)

    assert len(es.buffer) == 11

    from distributed.diagnostics.progress_stream import task_stream_append

    lists = {
        name: collections.deque(maxlen=100)
        for name in
        "start duration key name color worker worker_thread y alpha".split()
    }
    workers = dict()
    for msg in es.buffer:
        task_stream_append(lists, msg, workers)

    assert len([n for n in lists["name"] if n.startswith("transfer")]) == 2
    for name, color in zip(lists["name"], lists["color"]):
        if name == "transfer":
            assert color == "red"

    assert any(c == "black" for c in lists["color"])
Example no. 3
# Older variant of the same test, written against the legacy Executor API:
# `e` is the executor (the pre-Client name) and `_wait` is the coroutine
# form of wait from that era.
def test_eventstream(e, s, a, b):
    es = EventStream()
    s.add_plugin(es)
    assert es.buffer == []

    futures = e.map(div, [1] * 10, range(10))
    yield _wait(futures)

    assert len(es.buffer) == 10  # only the 10 div tasks this time
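
Example no. 4 below consumes the same buffer outside of a test: because es.buffer is a plain list of per-task message dicts, it loads directly into a DataFrame for analysis. A minimal sketch, assuming pandas is available and that each message carries a worker field (the benchmark below relies on this):

import pandas as pd

df = pd.DataFrame(es.buffer)        # one row per task transition message
print(df.groupby('worker').size())  # how many tasks each worker ran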
Example no. 4
# Assumed imports for this benchmark; AsyncGridSearchCV, CachingPlugin,
# bf_to_ss and aggregate_ss are project-local helpers and are taken as given.
import multiprocessing
from time import time

import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.pipeline import Pipeline

import dask_searchcv as dcv
from distributed import Client, LocalCluster
from distributed.diagnostics.eventstream import EventStream


def bench_search(version, lparam, rstate, refit, occupancy):
    categories = [
        'alt.atheism',
        'talk.religion.misc',
    ]

    # Uncomment the following to do the analysis on all the categories
    # categories = None
    data = fetch_20newsgroups(subset='train',
                              categories=categories,
                              data_home='/data/scikit_learn_data')
    test_data = fetch_20newsgroups(subset='test',
                                   categories=categories,
                                   data_home='/data/scikit_learn_data')

    parameters = [
        ('vect__max_df', (0.5, 0.75, 1.0)),
        ('vect__ngram_range', ((1, 1), (1, 2))),
        ('tfidf__use_idf', (True, False)),
        ('tfidf__norm', ('l1', 'l2')),
        ('clf__alpha', (1e-2, 1e-3, 1e-4, 1e-5)),
        ('clf__n_iter', (10, 50, 80)),
        ('clf__penalty', ('l2', 'elasticnet')),
    ]

    # order in which parameters join the grid as lparam grows
    parameter_selection = [0, 3, 1, 4, 2, 5, 6]

    param_grid = dict([parameters[i] for i in parameter_selection[:lparam]])

    refit = {True: 1, False: 0}[refit]  # normalize the flag to a plain 0/1 int

    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(random_state=rstate)),
    ])

    if version > 0:
        # n_workers / threads_per_worker could be passed here to size the cluster
        cluster = LocalCluster(diagnostics_port=None)
        client = Client(address=cluster.scheduler.address)
        event_stream = EventStream(scheduler=cluster.scheduler)

        if version == 3:
            client.run_on_scheduler(
                lambda dask_scheduler: dask_scheduler.add_plugin(
                    CachingPlugin(dask_scheduler)))

    if version == 0:
        search = GridSearchCV(pipeline,
                              param_grid,
                              n_jobs=-1,
                              verbose=1,
                              refit=refit)
    elif version == 1:
        search = dcv.GridSearchCV(pipeline,
                                  param_grid,
                                  scheduler=client,
                                  refit=refit)
    else:  # version == 2 or version == 3:
        search = AsyncGridSearchCV(pipeline,
                                   param_grid,
                                   threshold=1.1,
                                   client=client,
                                   refit=refit,
                                   occupancy_factor=occupancy)

    print("Fitting with {} parameters".format(
        len(ParameterGrid(search.param_grid))))

    start_t = time()
    if version <= 1:
        search.fit(data.data, data.target)
    else:
        search.fit_async(data.data, data.target)

    if refit:
        print("Fit results: {}".format((search.score(data.data, data.target),
                                        search.score(test_data.data,
                                                     test_data.target))))

    fit_duration = (time() - start_t)
    print("Fit took: {}".format(fit_duration))

    results = {
        'start_time': start_t,
        'fit_duration': fit_duration,
        'ncpu': multiprocessing.cpu_count()
    }

    if version > 0:
        # summarize the event stream collected during the fit
        bf = pd.DataFrame(event_stream.buffer)

        # per-worker occupancy: busy time divided by wall-clock span
        occupancy_series = []
        for worker in bf.worker.unique():
            ss_async = bf_to_ss(bf[bf.worker == worker])
            agg_ss = aggregate_ss(ss_async)
            occupancy_series.append(
                (agg_ss.stop - agg_ss.start).sum() /
                (agg_ss.iloc[-1].stop - agg_ss.iloc[0].start))

        print("occupancy:")
        print(pd.Series(occupancy_series).describe())

        # print("transfer-ratio: {}".format())

        # shut down the client (and its local cluster) now that we are done
        client.shutdown()

        results['events'] = bf

    return results
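
A hedged example of driving the benchmark; the argument values are illustrative, and version=2 selects the AsyncGridSearchCV path defined above:

if __name__ == '__main__':
    results = bench_search(version=2, lparam=4, rstate=42,
                           refit=True, occupancy=1.0)
    print(results['fit_duration'], results['ncpu'])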