Example #1
def test_auto_scatter(loop):  # noqa: F811
    np = pytest.importorskip('numpy')
    data = np.ones(int(1e7), dtype=np.uint8)

    def count_events(event_name, client):
        worker_events = client.run(lambda dask_worker: dask_worker.log)
        event_counts = {}
        for w, events in worker_events.items():
            event_counts[w] = len([event for event in list(events)
                                   if event[1] == event_name])
        return event_counts

    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as client:
            with parallel_backend('dask') as (ba, _):
                # Passing the same data as arg and kwarg triggers a single
                # scatter operation whose result is reused.
                Parallel()(delayed(noop)(data, data, i, opt=data)
                           for i in range(5))
            # By default large arrays are automatically scattered with
            # broadcast=1 which means that one worker must directly receive
            # the data from the scatter operation once.
            counts = count_events('receive-from-scatter', client)
            assert counts[a['address']] + counts[b['address']] == 1

    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as client:
            with parallel_backend('dask') as (ba, _):
                Parallel()(delayed(noop)(data[:3], i) for i in range(5))
            # Small arrays are passed within the task definition without going
            # through a scatter operation.
            counts = count_events('receive-from-scatter', client)
            assert counts[a['address']] == 0
            assert counts[b['address']] == 0
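The noop helper called above is not defined on this page. A minimal sketch, assuming it only needs to accept arbitrary arguments and do nothing (the test only exercises how its arguments are scattered):

def noop(*args, **kwargs):
    # Accept any mix of positional and keyword arguments and return None;
    # only the scattering of the large array arguments matters to the test.
    pass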
Example #2
def test_local_client(loop):
    def produce(n):
        with local_client() as c:
            x = c.channel('x')
            for i in range(n):
                future = c.submit(slowinc, i, delay=0.01, key='f-%d' % i)
                x.append(future)

            x.flush()

    def consume():
        with local_client() as c:
            x = c.channel('x')
            y = c.channel('y')
            last = 0
            for i, future in enumerate(x):
                last = c.submit(add, future, last, key='add-' + future.key)
                y.append(last)

    with cluster() as (s, [a, b]):
        with Client(('127.0.0.1', s['port']), loop=loop) as c:
            x = c.channel('x')
            y = c.channel('y')

            producers = (c.submit(produce, 5), c.submit(produce, 10))
            consumer = c.submit(consume)

            results = []
            for i, future in enumerate(take(15, y)):
                result = future.result()
                results.append(result)

            assert len(results) == 15
            assert all(0 < r < 100 for r in results)
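slowinc and add are not shown here; they are assumed to match the small helpers in distributed.utils_test. A sketch under that assumption:

from time import sleep


def slowinc(x, delay=0.02):
    # Increment after a short pause so the producer's futures are still
    # resolving while the consumer iterates over the channel.
    sleep(delay)
    return x + 1


def add(x, y):
    return x + y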
Example #3
def test_stop(loop):
    def produce(n):
        with local_client() as c:
            x = c.channel('x')
            for i in range(n):
                future = c.submit(slowinc, i, delay=0.01, key='f-%d' % i)
                x.append(future)

            x.stop()
            x.flush()

    with cluster() as (s, [a, b]):
        with Client(('127.0.0.1', s['port']), loop=loop) as c:
            x = c.channel('x')

            producer = c.submit(produce, 5)

            futures = list(x)
            assert len(futures) == 5

            with pytest.raises(StopIteration):
                x.append(c.submit(inc, 1))

            with Client(('127.0.0.1', s['port']), loop=loop) as c2:
                xx = c2.channel('x')
                futures = list(xx)
                assert len(futures) == 5
Example #4
def test_nested_backend_context_manager(loop):  # noqa: F811
    def get_nested_pids():
        pids = set(Parallel(n_jobs=2)(delayed(os.getpid)() for _ in range(2)))
        pids |= set(Parallel(n_jobs=2)(delayed(os.getpid)() for _ in range(2)))
        return pids

    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as client:
            with parallel_backend('dask') as (ba, _):
                pid_groups = Parallel(n_jobs=2)(
                    delayed(get_nested_pids)()
                    for _ in range(10)
                )
                for pid_group in pid_groups:
                    assert len(set(pid_group)) <= 2

        # No deadlocks
        with Client(s['address'], loop=loop) as client:  # noqa: F841
            with parallel_backend('dask') as (ba, _):
                pid_groups = Parallel(n_jobs=2)(
                    delayed(get_nested_pids)()
                    for _ in range(10)
                )
                for pid_group in pid_groups:
                    assert len(set(pid_group)) <= 2
Example #5
def test_dataframe_groupby_tasks(loop):
    df = pd.util.testing.makeTimeDataFrame()
    df["A"] = df.A // 0.1
    df["B"] = df.B // 0.1
    ddf = dd.from_pandas(df, npartitions=10)
    with cluster() as (c, [a, b]):
        with Client(("127.0.0.1", c["port"]), loop=loop) as c:
            with dask.set_options(get=c.get):
                for ind in [lambda x: "A", lambda x: x.A]:
                    a = df.groupby(ind(df)).apply(len)
                    b = ddf.groupby(ind(ddf)).apply(len)
                    assert_equal(a, b.compute(get=dask.get).sort_index())
                    assert not any("partd" in k[0] for k in b.dask)

                    a = df.groupby(ind(df)).B.apply(len)
                    b = ddf.groupby(ind(ddf)).B.apply(len)
                    assert_equal(a, b.compute(get=dask.get).sort_index())
                    assert not any("partd" in k[0] for k in b.dask)

                with pytest.raises(NotImplementedError):
                    ddf.groupby(ddf[["A", "B"]]).apply(len)

                a = df.groupby(["A", "B"]).apply(len)
                b = ddf.groupby(["A", "B"]).apply(len)

                assert_equal(a, b.compute(get=dask.get).sort_index())
Example #6
def test_dont_assume_function_purity(loop):
    with cluster() as (s, [a, b]):
        with parallel_backend('distributed', loop=loop,
                scheduler_host=('127.0.0.1', s['port'])):

            x, y = Parallel()(delayed(random2)() for i in range(2))
            assert x != y
Example #7
def test_dataframe_groupby_tasks(loop):
    df = pd.util.testing.makeTimeDataFrame()
    df['A'] = df.A // 0.1
    df['B'] = df.B // 0.1
    ddf = dd.from_pandas(df, npartitions=10)
    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port']), loop=loop) as e:
            with dask.set_options(get=e.get):
                for ind in [lambda x: 'A', lambda x: x.A]:
                    a = df.groupby(ind(df)).apply(len)
                    b = ddf.groupby(ind(ddf)).apply(len)
                    assert_equal(a, b.compute(get=dask.get).sort_index())
                    assert not any('partd' in k[0] for k in b.dask)

                    a = df.groupby(ind(df)).B.apply(len)
                    b = ddf.groupby(ind(ddf)).B.apply(len)
                    assert_equal(a, b.compute(get=dask.get).sort_index())
                    assert not any('partd' in k[0] for k in b.dask)

                with pytest.raises(NotImplementedError):
                    ddf.groupby(ddf[['A', 'B']]).apply(len)

                a = df.groupby(['A', 'B']).apply(len)
                b = ddf.groupby(['A', 'B']).apply(len)

                assert_equal(a, b.compute(get=dask.get).sort_index())
Example #8
def test_dataframes(loop):
    dfs = [pd.DataFrame({'x': np.random.random(100),
                         'y': np.random.random(100)},
                        index=list(range(i, i + 100)))
           for i in range(0, 100*10, 100)]
    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port']), loop=loop) as e:
            remote_dfs = e.map(lambda x: x, dfs)
            rdf = futures_to_dask_dataframe(remote_dfs, divisions=True)
            name = 'foo'
            ldf = dd.DataFrame({(name, i): df for i, df in enumerate(dfs)},
                               name, dfs[0].columns,
                               list(range(0, 1000, 100)) + [999])

            assert rdf.divisions == ldf.divisions
            tm.assert_frame_equal(rdf.compute(get=e.get),
                                  ldf.compute(get=dask.get))

            exprs = [lambda df: df.x.mean(),
                     lambda df: df.y.std(),
                     lambda df: df.assign(z=df.x + df.y).drop_duplicates(),
                     lambda df: df.index,
                     lambda df: df.x,
                     lambda df: df.x.cumsum(),
                     lambda df: df.loc[50:75]]
            for f in exprs:
                local = f(ldf).compute(get=dask.get)
                remote = f(rdf).compute(get=e.get)
                assert_equal(local, remote)
Example #9
def test_dask_persisted_entityset(entityset, capsys):
    times = list([datetime(2011, 4, 9, 10, 30, i * 6) for i in range(5)] +
                 [datetime(2011, 4, 9, 10, 31, i * 9) for i in range(4)] +
                 [datetime(2011, 4, 9, 10, 40, 0)] +
                 [datetime(2011, 4, 10, 10, 40, i) for i in range(2)] +
                 [datetime(2011, 4, 10, 10, 41, i * 3) for i in range(3)] +
                 [datetime(2011, 4, 10, 11, 10, i * 3) for i in range(2)])
    labels = [False] * 3 + [True] * 2 + [False] * 9 + [True] + [False] * 2
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': range(17)})
    property_feature = IdentityFeature(entityset['log']['value']) > 10

    with cluster() as (scheduler, [a, b]):
        dkwargs = {'cluster': scheduler['address']}
        feature_matrix = calculate_feature_matrix([property_feature],
                                                  entityset=entityset,
                                                  cutoff_time=cutoff_time,
                                                  verbose=True,
                                                  chunk_size=.13,
                                                  dask_kwargs=dkwargs,
                                                  approximate='1 hour')
        assert (feature_matrix == labels).values.all()
        feature_matrix = calculate_feature_matrix([property_feature],
                                                  entityset=entityset,
                                                  cutoff_time=cutoff_time,
                                                  verbose=True,
                                                  chunk_size=.13,
                                                  dask_kwargs=dkwargs,
                                                  approximate='1 hour')
        captured = capsys.readouterr()
        assert "Using EntitySet persisted on the cluster as dataset " in captured[0]
        assert (feature_matrix == labels).values.all()
Example #10
def test_scheduler_param_distributed(loop):
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop, set_as_default=False) as client:
            gs = dcv.GridSearchCV(MockClassifier(), {'foo_param': [0, 1, 2]},
                                  cv=3, scheduler=client)
            gs.fit(X, y)
Example #11
def test_directed_scatter_sync(loop):
    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port']), loop=loop) as e:
            e.scatter([1, 2, 3], workers=[('127.0.0.1', b['port'])])
            has_what = sync(e.loop, e.center.has_what)
            assert len(has_what[('127.0.0.1', b['port'])]) == 3
            assert len(has_what[('127.0.0.1', a['port'])]) == 0
Example #12
def test_directed_scatter_sync(loop):
    with cluster() as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            futures = e.scatter([1, 2, 3], workers=[('127.0.0.1', b['port'])])
            has_what = sync(loop, e.scheduler.has_what)
            assert len(has_what[('127.0.0.1', b['port'])]) == 3
            assert len(has_what[('127.0.0.1', a['port'])]) == 0
Example #13
def test_dask_distributed_netcdf_roundtrip(
        loop, tmp_netcdf_filename, engine, nc_format):

    if engine not in ENGINES:
        pytest.skip('engine not available')

    chunks = {'dim1': 4, 'dim2': 3, 'dim3': 6}

    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:

            original = create_test_data().chunk(chunks)

            if engine == 'scipy':
                with pytest.raises(NotImplementedError):
                    original.to_netcdf(tmp_netcdf_filename,
                                       engine=engine, format=nc_format)
                return

            original.to_netcdf(tmp_netcdf_filename,
                               engine=engine, format=nc_format)

            with xr.open_dataset(tmp_netcdf_filename,
                                 chunks=chunks, engine=engine) as restored:
                assert isinstance(restored.var1.data, da.Array)
                computed = restored.compute()
                assert_allclose(original, computed)
Example #14
def test_manual_scatter(loop):  # noqa: F811
    x = CountSerialized(1)
    y = CountSerialized(2)
    z = CountSerialized(3)

    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as client:  # noqa: F841
            with parallel_backend('dask', scatter=[x, y]) as (ba, _):
                f = delayed(add5)
                tasks = [f(x, y, z, d=4, e=5),
                         f(x, z, y, d=5, e=4),
                         f(y, x, z, d=x, e=5),
                         f(z, z, x, d=z, e=y)]
                expected = [func(*args, **kwargs)
                            for func, args, kwargs in tasks]
                results = Parallel()(tasks)

            # Scatter must take a list/tuple
            with pytest.raises(TypeError):
                with parallel_backend('dask', loop=loop, scatter=1):
                    pass

    assert results == expected

    # Scattered variables only serialized once
    assert x.count == 1
    assert y.count == 1
    assert z.count == 4
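CountSerialized and add5 come from joblib's test suite and are not reproduced above. A plausible sketch, assuming CountSerialized counts pickle round-trips through __reduce__ and supports addition, and add5 simply sums its five operands; the exact definitions may differ:

class CountSerialized(object):
    # Wraps a value and counts how many times the object is pickled, so the
    # test can check that scattered arguments are serialized only once.
    def __init__(self, x):
        self.x = x
        self.count = 0

    def __add__(self, other):
        return self.x + getattr(other, 'x', other)

    __radd__ = __add__

    def __reduce__(self):
        self.count += 1
        return (CountSerialized, (self.x,))


def add5(a, b, c, d=0, e=0):
    return a + b + c + d + e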
Example #15
def test_read_csv_sync(loop):
    import dask.dataframe as dd
    import pandas as pd
    with cluster(nworkers=3) as (s, [a, b, c]):
        with make_hdfs() as (hdfs, basedir):
            with hdfs.open('%s/1.csv' % basedir, 'wb') as f:
                f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

            with hdfs.open('%s/2.csv' % basedir, 'wb') as f:
                f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

            with Client(('127.0.0.1', s['port']), loop=loop) as e:
                values = dd.read_csv('hdfs://%s/*.csv' % basedir,
                                     lineterminator='\n',
                                     collection=False, header=0)
                futures = e.compute(values)
                assert all(isinstance(f, Future) for f in futures)
                L = e.gather(futures)
                assert isinstance(L[0], pd.DataFrame)
                assert list(L[0].columns) == ['name', 'amount', 'id']

                df = dd.read_csv('hdfs://%s/*.csv' % basedir,
                                 lineterminator='\n',
                                 collection=True, header=0)
                assert isinstance(df, dd.DataFrame)
                assert list(df.head().iloc[0]) == ['Alice', 100, 1]
Example #16
def test_dask_distributed_read_netcdf_integration_test(loop, engine, autoclose,
                                                       nc_format):

    if engine == 'h5netcdf' and autoclose:
        pytest.skip('h5netcdf does not support autoclose')

    if nc_format not in NC_FORMATS[engine]:
        pytest.skip('invalid format for engine')

    chunks = {'dim1': 4, 'dim2': 3, 'dim3': 6}

    with create_tmp_file(allow_cleanup_failure=ON_WINDOWS) as filename:
        with cluster() as (s, [a, b]):
            with Client(s['address'], loop=loop) as c:

                original = create_test_data()
                original.to_netcdf(filename, engine=engine, format=nc_format)

                with xr.open_dataset(filename,
                                     chunks=chunks,
                                     engine=engine,
                                     autoclose=autoclose) as restored:
                    assert isinstance(restored.var1.data, da.Array)
                    computed = restored.compute()
                    assert_allclose(original, computed)
Example #17
def test_gather_after_failed_worker(loop):
    with cluster(active_rpc_timeout=10) as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            L = c.map(inc, range(10))
            wait(L)
            a['proc']().terminate()
            result = c.gather(L)
            assert result == list(map(inc, range(10)))
Example #18
def test_submit_after_failed_worker_sync(loop):
    with cluster(active_rpc_timeout=10) as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            L = c.map(inc, range(10))
            wait(L)
            a['proc']().terminate()
            total = c.submit(sum, L)
            assert total.result() == sum(map(inc, range(10)))
Example #19
def test_gather_after_failed_worker():
    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port'])) as e:
            L = e.map(inc, range(10))
            wait(L)
            a['proc'].terminate()
            result = e.gather(L)
            assert result == list(map(inc, range(10)))
Example #20
def test_submit_after_failed_worker(loop):
    with cluster() as (s, [a, b]):
        with Client(('127.0.0.1', s['port']), loop=loop) as c:
            L = c.map(inc, range(10))
            wait(L)
            a['proc'].terminate()
            total = c.submit(sum, L)
            assert total.result() == sum(map(inc, range(10)))
Example #21
def test_gather_after_failed_worker(loop):
    with cluster() as (s, [a, b]):
        with Client(('127.0.0.1', s['port']), loop=loop) as c:
            L = c.map(inc, range(10))
            wait(L)
            a['proc'].terminate()
            result = c.gather(L)
            assert result == list(map(inc, range(10)))
Example #22
def test_futures_to_delayed_bag(loop):
    db = pytest.importorskip('dask.bag')
    L = [1, 2, 3]
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:  # flake8: noqa
            futures = c.scatter([L, L])
            b = db.from_delayed(futures)
            assert list(b) == L + L
Example #23
def test_fast(loop):
    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port']), loop=loop) as e:
            L = e.map(inc, range(100))
            L2 = e.map(dec, L)
            L3 = e.map(add, L, L2)
            p = progress(L3, multi=True, complete=True, notebook=True)
            assert set(p.all_keys) == {'inc', 'dec', 'add'}
Example #24
def test_stress_gc(loop, func, n):
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            x = c.submit(func, 1)
            for i in range(n):
                x = c.submit(func, x)

            assert x.result() == n + 2
Example #25
def test_stress_gc():
    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port']), delete_batch_time=0.5) as e:
            x = e.submit(slowinc, 1)
            for i in range(20):  # this could be increased
                x = e.submit(slowinc, x)

            assert x.result() == 22
Example #26
def test_progress_function(loop, capsys):
    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port']), loop=loop) as e:
            f = e.submit(lambda: 1)
            g = e.submit(lambda: 2)

            progress([[f], [[g]]], notebook=False)
            check_bar_completed(capsys)
Example #27
def test_Future_exception_sync(loop):
    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port']), loop=loop) as e:
            x = e.submit(div, 1, 0)
            assert isinstance(x.exception(), ZeroDivisionError)

            x = e.submit(div, 1, 1)
            assert x.exception() is None
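div is assumed to be the trivial division helper from distributed's test utilities; a sketch under that assumption:

def div(x, y):
    # div(1, 0) raises ZeroDivisionError inside the task, which the test
    # then retrieves via Future.exception() instead of Future.result().
    return x / y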
Example #28
def test_submit_after_failed_worker():
    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port'])) as e:
            L = e.map(inc, range(10))
            wait(L)
            a['proc'].terminate()
            total = e.submit(sum, L)
            assert total.result() == sum(map(inc, range(10)))
Example #29
def test_dask_distributed_rasterio_integration_test(loop):
    with create_tmp_geotiff() as (tmp_file, expected):
        with cluster() as (s, [a, b]):
            with Client(s['address'], loop=loop) as c:
                da_tiff = xr.open_rasterio(tmp_file, chunks={'band': 1})
                assert isinstance(da_tiff.data, da.Array)
                actual = da_tiff.compute()
                assert_allclose(actual, expected)
Example #30
def test_read_text_bucket_key_inputs(loop):
    with cluster() as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            a = read_text(test_bucket_name, '/text/accounts', lazy=True)
            b = read_text(test_bucket_name, 'text/accounts', lazy=True)
            c = read_text(test_bucket_name + '/text/accounts', lazy=True)

            assert a._keys() == b._keys() == c._keys()
Example #31
def test_get_worker_name(loop):
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:

            def f():
                get_client().submit(inc, 1).result()

            c.run(f)

            def func(dask_scheduler):
                return list(dask_scheduler.clients)

            start = time()
            while not any('worker' in n for n in c.run_on_scheduler(func)):
                sleep(0.1)
                assert time() < start + 10
Example #32
def test_as_completed_repeats(loop):
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            ac = as_completed()
            x = c.submit(inc, 1)
            ac.add(x)
            ac.add(x)

            assert next(ac) is x
            assert next(ac) is x

            with pytest.raises(StopIteration):
                next(ac)

            ac.add(x)
            assert next(ac) is x
Example #33
def test_traceback_sync(loop):
    with cluster() as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            x = e.submit(div, 1, 0)
            tb = x.traceback()
            if sys.version_info[0] >= 3:
                assert any('x / y' in line for line in tb)

            y = e.submit(inc, x)
            tb2 = y.traceback()

            assert set(tb2).issuperset(set(tb))

            z = e.submit(div, 1, 2)
            tb = z.traceback()
            assert tb is None
Example #34
def test_regressor(xgboost_loop):  # noqa
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=xgboost_loop):
            a = dxgb.XGBRegressor()
            X2 = da.from_array(X, 5)
            y2 = da.from_array(y, 5)
            weight1 = da.from_array(weight, 5)
            a.fit(X2, y2, sample_weight=weight1)
            p1 = a.predict(X2)

    b = xgb.XGBRegressor()
    b.fit(X, y, sample_weight=weight)

    np.testing.assert_array_almost_equal(a.feature_importances_,
                                         b.feature_importances_)
    assert_eq(p1, b.predict(X))
Example #35
def test_avro_sync(loop):
    avro_files = {
        '/tmp/test/1.avro': avro_bytes,
        '/tmp/test/2.avro': avro_bytes
    }
    with make_hdfs() as hdfs:
        for k, v in avro_files.items():
            with hdfs.open(k, 'wb') as f:
                f.write(v)

        with cluster(nworkers=1) as (s, [a]):
            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                futures = read_avro('/tmp/test/*.avro', lazy=False)
                assert all(isinstance(f, Future) for f in futures)
                L = e.gather(futures)
                assert L[0][:5] == data[:5]
Example #36
def test_classifier_local_predict(loop, listen_port):  # noqa
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop):
            X, y, w, dX, dy, dw = _create_data('classification', output="array")

            a = dlgbm.LGBMClassifier(local_listen_port=listen_port)
            a = a.fit(dX, dy, sample_weight=dw)
            p1 = a.to_local().predict(dX)

            b = lightgbm.LGBMClassifier()
            b.fit(X, y, sample_weight=w)
            p2 = b.predict(X)

            assert_eq(p1, p2)
            assert_eq(y, p1)
            assert_eq(y, p2)
Example #37
def test_gather_then_submit_after_failed_workers(loop):
    with cluster(nworkers=4) as (s, [w, x, y, z]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            L = e.map(inc, range(20))
            wait(L)
            w['proc'].terminate()
            total = e.submit(sum, L)
            wait([total])

            (_, port) = first(e.scheduler.who_has[total.key])
            for d in [x, y, z]:
                if d['port'] == port:
                    d['proc'].terminate()

            result = e.gather([total])
            assert result == [sum(map(inc, range(20)))]
Example #38
def test_futures_to_dask_array(loop):
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            remote_arrays = [[c.submit(np.full, (3, 3), i + j)
                                for i in range(3)]
                                for j in range(3)]

            x = futures_to_dask_array(remote_arrays, client=c)
            assert x.chunks == ((3, 3, 3), (3, 3, 3))
            assert x.dtype == np.full((), 0).dtype

            assert x.sum().compute(get=c.get) == 162
            assert (x + x.T).sum().compute(get=c.get) == 162 * 2

            y = futures_to_collection(remote_arrays, client=c)
            assert x.dask == y.dask
Example #39
def test_futures_to_dask_array(loop):
    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port']), loop=loop) as e:
            remote_arrays = [[
                e.submit(np.full, (3, 3), i + j) for i in range(3)
            ] for j in range(3)]

            x = futures_to_dask_array(remote_arrays, executor=e)
            assert x.chunks == ((3, 3, 3), (3, 3, 3))
            assert x.dtype == np.full((), 0).dtype

            assert x.sum().compute(get=e.get) == 162
            assert (x + x.T).sum().compute(get=e.get) == 162 * 2

            y = futures_to_collection(remote_arrays, executor=e)
            assert x.dask == y.dask
Example #40
def test_start_ipython_scheduler_magic(loop, zmq_ctx):
    with cluster(1) as (s, [a]):
        with Client(s["address"], loop=loop) as e, mock_ipython() as ip:
            info = e.start_ipython_scheduler()

        expected = [
            {"magic_kind": "line", "magic_name": "scheduler"},
            {"magic_kind": "cell", "magic_name": "scheduler"},
        ]

        call_kwargs_list = [
            kwargs for (args, kwargs) in ip.register_magic_function.call_args_list
        ]
        assert call_kwargs_list == expected
        magic = ip.register_magic_function.call_args_list[0][0][0]
        magic(line="", cell="scheduler")
Example #41
def test_read_text_bucket_key_inputs(loop):
    with cluster() as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            a = read_text(test_bucket_name,
                          '/test/accounts*',
                          lazy=True,
                          anon=True)
            b = read_text(test_bucket_name,
                          'test/accounts*',
                          lazy=True,
                          anon=True)
            c = read_text(test_bucket_name + '/test/accounts*',
                          lazy=True,
                          anon=True)

            assert a._keys() == b._keys() == c._keys()
Example #42
def test_start_ipython_remote(loop, zmq_ctx):
    from distributed._ipython_utils import remote_magic

    with cluster(1) as (s, [a]):
        with Client(s["address"], loop=loop) as e, mock_ipython() as ip:
            worker = first(e.nthreads())
            ip.user_ns["info"] = e.start_ipython_workers(worker)[worker]
            remote_magic("info 1")  # line magic
            remote_magic("info", "worker")  # cell magic

        expected = [
            ((remote_magic,), {"magic_kind": "line", "magic_name": "remote"}),
            ((remote_magic,), {"magic_kind": "cell", "magic_name": "remote"}),
        ]
        assert ip.register_magic_function.call_args_list == expected
        assert ip.register_magic_function.call_count == 2
Example #43
def test_start_ipython_workers(loop, zmq_ctx):
    from jupyter_client import BlockingKernelClient

    with cluster(1) as (s, [a]):
        with Client(s["address"], loop=loop) as e:
            info_dict = e.start_ipython_workers()
            info = first(info_dict.values())
            kc = BlockingKernelClient()
            kc.load_connection_info(info)
            kc.start_channels()
            kc.wait_for_ready(timeout=10)
            msg_id = kc.execute("worker")
            reply = kc.get_shell_msg(timeout=10)
            assert reply["parent_header"]["msg_id"] == msg_id
            assert reply["content"]["status"] == "ok"
            kc.stop_channels()
Example #44
def test_occupancy(loop):
    with cluster(nanny=True) as (s, [a, b]):
        rm = Occupancy(('127.0.0.1', s['port']), interval=0.01)
        for k in ['host', 'processing', 'waiting']:
            assert k in rm.cds.data

        start = time()
        while not rm.cds.data['host']:
            loop.run_sync(lambda: gen.sleep(0.05))
            assert time() < start + 2

        assert (len(rm.cds.data['host']) == len(rm.cds.data['processing']) ==
                len(rm.cds.data['waiting']) == 2)

        assert isinstance(rm.figure, Figure)
        rm.stream.close()
Example #45
def test_start_ipython_workers_magic(loop, zmq_ctx):
    with cluster(2) as (s, [a, b]):

        with Executor(('127.0.0.1', s['port']),
                      loop=loop) as e, mock_ipython() as ip:
            workers = list(e.ncores())[:2]
            names = ['magic%i' % i for i in range(len(workers))]
            info_dict = e.start_ipython_workers(workers, magic_names=names)

        expected = [
            {
                'magic_kind': 'line',
                'magic_name': 'remote'
            },
            {
                'magic_kind': 'cell',
                'magic_name': 'remote'
            },
            {
                'magic_kind': 'line',
                'magic_name': 'magic0'
            },
            {
                'magic_kind': 'cell',
                'magic_name': 'magic0'
            },
            {
                'magic_kind': 'line',
                'magic_name': 'magic1'
            },
            {
                'magic_kind': 'cell',
                'magic_name': 'magic1'
            },
        ]
        call_kwargs_list = [
            kwargs
            for (args, kwargs) in ip.register_magic_function.call_args_list
        ]
        assert call_kwargs_list == expected
        assert ip.register_magic_function.call_count == 6
        magics = [
            args[0][0]
            for args in ip.register_magic_function.call_args_list[2:]
        ]
        magics[-1](line="", cell="worker")
        [m.client.stop_channels() for m in magics]
Example #46
def test_start_ipython_workers(loop, zmq_ctx):
    from jupyter_client import BlockingKernelClient

    with cluster(1) as (s, [a]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            info_dict = e.start_ipython_workers()
            info = first(info_dict.values())
            key = info.pop('key')
            kc = BlockingKernelClient(**info)
            kc.session.key = key
            kc.start_channels()
            kc.wait_for_ready(timeout=10)
            msg_id = kc.execute("worker")
            reply = kc.get_shell_msg(timeout=10)
            assert reply['parent_header']['msg_id'] == msg_id
            assert reply['content']['status'] == 'ok'
            kc.stop_channels()
Example #47
def test_distributed_persist(loop, dask_array):
    """Test persist() for distributed machines."""
    q = ureg.Quantity(dask_array, units_)

    with cluster() as (s, [a, b]):
        with Client(s["address"], loop=loop):
            comps = add_five(q)
            persisted_q = comps.persist()

            comps_truth = dask_array + 5
            persisted_truth = comps_truth.persist()

            assert np.all(persisted_q.m == persisted_truth)
            assert dask.is_dask_collection(persisted_q)
            assert persisted_q.units == units_

    assert q.magnitude is dask_array
Example #48
def test_classifier_proba(loop, output, listen_port, centers):
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as client:
            X, y, w, dX, dy, dw = _create_data('classification',
                                               output=output,
                                               centers=centers)

            a = dlgbm.LGBMClassifier(local_listen_port=listen_port)
            a = a.fit(dX, dy, sample_weight=dw)
            p1 = a.predict_proba(dX, client=client)
            p1 = p1.compute()

            b = lightgbm.LGBMClassifier()
            b.fit(X, y, sample_weight=w)
            p2 = b.predict_proba(X)

            assert_eq(p1, p2, atol=0.3)
Example #49
def test_restart_sync(loop):
    with cluster(nanny=True) as (s, [a, b]):
        with Client(s["address"], loop=loop) as c:
            x = c.submit(div, 1, 2)
            x.result()

            assert sync(loop, c.scheduler.who_has)
            c.restart()
            assert not sync(loop, c.scheduler.who_has)
            assert x.cancelled()
            assert len(c.nthreads()) == 2

            with pytest.raises(CancelledError):
                x.result()

            y = c.submit(div, 1, 3)
            assert y.result() == 1 / 3
Example #50
def test_start_ipython_workers_magic(loop, zmq_ctx):
    with cluster(2) as (s, [a, b]):

        with Client(s["address"], loop=loop) as e, mock_ipython() as ip:
            workers = list(e.nthreads())[:2]
            names = ["magic%i" % i for i in range(len(workers))]
            info_dict = e.start_ipython_workers(workers, magic_names=names)

        expected = [
            {
                "magic_kind": "line",
                "magic_name": "remote"
            },
            {
                "magic_kind": "cell",
                "magic_name": "remote"
            },
            {
                "magic_kind": "line",
                "magic_name": "magic0"
            },
            {
                "magic_kind": "cell",
                "magic_name": "magic0"
            },
            {
                "magic_kind": "line",
                "magic_name": "magic1"
            },
            {
                "magic_kind": "cell",
                "magic_name": "magic1"
            },
        ]
        call_kwargs_list = [
            kwargs
            for (args, kwargs) in ip.register_magic_function.call_args_list
        ]
        assert call_kwargs_list == expected
        assert ip.register_magic_function.call_count == 6
        magics = [
            args[0][0]
            for args in ip.register_magic_function.call_args_list[2:]
        ]
        magics[-1](line="", cell="worker")
        [m.client.stop_channels() for m in magics]
Example #51
def test_futures_to_delayed_array(loop):
    da = pytest.importorskip("dask.array")
    from dask.array.utils import assert_eq

    np = pytest.importorskip("numpy")
    x = np.arange(5)
    with cluster() as (s, [a, b]):
        with Client(s["address"], loop=loop) as c:
            futures = c.scatter([x, x])
            A = da.concatenate(
                [
                    da.from_delayed(f, shape=x.shape, dtype=x.dtype)
                    for f in futures
                ],
                axis=0,
            )
            assert_eq(A.compute(), np.concatenate([x, x], axis=0))
Example #52
def test_values(loop):
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            L = [c.submit(inc, i) for i in range(5)]
            wait(L)
            p = MultiProgressWidget(L)
            sync(loop, p.listen)
            assert set(p.bars) == {'inc'}
            assert p.status == 'finished'
            assert p.comm.closed()
            assert '5 / 5' in p.bar_texts['inc'].value
            assert p.bars['inc'].value == 1.0

            x = c.submit(throws, 1)
            p = MultiProgressWidget([x])
            sync(loop, p.listen)
            assert p.status == 'error'
Example #53
def test_regressor_local_predict(loop, listen_port):
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop):
            X, y, w, dX, dy, dw = _create_data('regression', output="array")

            a = dlgbm.LGBMRegressor(local_listen_port=listen_port, seed=42)
            a = a.fit(dX, dy, sample_weight=dw)
            p1 = a.predict(dX)
            p2 = a.to_local().predict(X)
            s1 = r2_score(dy, p1)
            p1 = p1.compute()
            s2 = a.to_local().score(X, y)
            print(s1)

            # Predictions and scores should be the same
            assert_eq(p1, p2)
            assert np.isclose(s1, s2)
Example #54
def test_simple(loop, joblib):
    if joblib is None:
        pytest.skip()
    Parallel = joblib.Parallel
    delayed = joblib.delayed
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as client:
            with joblib.parallel_backend('dask') as (ba, _):
                seq = Parallel()(delayed(inc)(i) for i in range(10))
                assert seq == [inc(i) for i in range(10)]

                with pytest.raises(ValueError):
                    Parallel()(delayed(slow_raise_value_error)(i == 3)
                               for i in range(10))

                seq = Parallel()(delayed(inc)(i) for i in range(10))
                assert seq == [inc(i) for i in range(10)]
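inc and slow_raise_value_error are not defined on this page. A sketch consistent with how the test uses them (inc as a pure increment, slow_raise_value_error failing only when its flag is set); the exact signatures in joblib's test suite may differ:

from time import sleep


def inc(x):
    return x + 1


def slow_raise_value_error(condition, duration=0.05):
    # Sleep briefly so the tasks overlap, then fail only for the one task
    # whose condition is True; Parallel() should surface that ValueError.
    sleep(duration)
    if condition:
        raise ValueError("condition evaluated to True")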
Example #55
def test_read_csv_sync_compute(loop):
    import dask.dataframe as dd
    import pandas as pd
    with cluster(nworkers=1) as (s, [a]):
        with make_hdfs() as hdfs:
            with hdfs.open('/tmp/test/1.csv', 'wb') as f:
                f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

            with hdfs.open('/tmp/test/2.csv', 'wb') as f:
                f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                for lazy in [True, False]:
                    df = read_csv('/tmp/test/*.csv',
                                  collection=True,
                                  lazy=lazy)
                    assert df.amount.sum().compute(get=e.get) == 1000
Example #56
def test_values(loop):
    with cluster() as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            L = [e.submit(inc, i) for i in range(5)]
            wait(L)
            p = MultiProgressWidget(L)
            sync(loop, p.listen)
            assert set(p.bars) == {'inc'}
            assert p.status == 'finished'
            assert p.stream.closed()
            assert '5 / 5' in p.bar_texts['inc'].value
            assert p.bars['inc'].value == 1.0

            x = e.submit(throws, 1)
            p = MultiProgressWidget([x])
            sync(loop, p.listen)
            assert p.status == 'error'
Example #57
def test_restart_sync(loop):
    with cluster(nanny=True) as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            x = e.submit(div, 1, 2)
            x.result()

            assert sync(loop, e.scheduler.who_has)
            e.restart()
            assert not sync(loop, e.scheduler.who_has)
            assert x.cancelled()
            assert len(e.ncores()) == 2

            with pytest.raises(CancelledError):
                x.result()

            y = e.submit(div, 1, 3)
            assert y.result() == 1 / 3
Example #58
def test_text_progressbar(capsys, loop):
    with cluster(nanny=True) as (s, [a, b]):
        with Client(('127.0.0.1', s['port']), loop=loop) as c:
            futures = c.map(inc, range(10))
            p = TextProgressBar(futures, interval=0.01, complete=True)
            c.gather(futures)

            start = time()
            while p.status != 'finished':
                sleep(0.01)
                assert time() - start < 5

            check_bar_completed(capsys)
            assert p._last_response == {'all': 10,
                                        'remaining': 0,
                                        'status': 'finished'}
            assert p.stream.closed()
Example #59
def test_sync(loop):
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            counter = c.submit(Counter, actor=True)
            counter = counter.result()

            assert counter.n == 0

            future = counter.increment()
            n = future.result()
            assert n == 1
            assert counter.n == 1

            assert future.result() == future.result()

            assert 'ActorFuture' in repr(future)
            assert 'distributed.actor' not in repr(future)
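The Counter class used above is not shown. A minimal sketch of an actor class compatible with the calls the test makes (an n attribute read through the proxy and an increment() method); the actual test class may differ:

class Counter:
    # Plain Python class: submitting it with actor=True makes distributed
    # instantiate it on one worker and hand back an Actor proxy. In the
    # synchronous client above, attribute reads return the current value and
    # method calls on the proxy return ActorFutures.
    n = 0

    def __init__(self):
        self.n = 0

    def increment(self):
        self.n += 1
        return self.n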
Example #60
def test_as_completed_distributed(loop):  # noqa
    cluster_kwargs = dict(active_rpc_timeout=10, nanny=Nanny)
    if DISTRIBUTED_2_11_0:
        cluster_kwargs["disconnect_timeout"] = 10
    with cluster(**cluster_kwargs) as (s, [a, b]):
        with Client(s["address"], loop=loop) as c:
            counter_name = "counter_name"
            counter = Variable(counter_name, client=c)
            counter.set(0)
            lock_name = "lock"

            killed_workers_name = "killed_workers"
            killed_workers = Variable(killed_workers_name, client=c)
            killed_workers.set({})

            X, y = make_classification(n_samples=100,
                                       n_features=10,
                                       random_state=0)
            gs = dcv.GridSearchCV(
                AsCompletedEstimator(killed_workers_name, lock_name,
                                     counter_name, 7),
                param_grid={"foo_param": [0, 1, 2]},
                cv=3,
                refit=False,
                cache_cv=False,
                scheduler=c,
            )
            gs.fit(X, y)

            def f(dask_scheduler):
                return dask_scheduler.transition_log

            def check_reprocess(transition_log):
                finished = set()
                for transition in transition_log:
                    key, start_state, end_state = (
                        transition[0],
                        transition[1],
                        transition[2],
                    )
                    assert key not in finished
                    if ("score" in key and start_state == "memory"
                            and end_state == "forgotten"):
                        finished.add(key)

            check_reprocess(c.run_on_scheduler(f))