def test_dataframe_object_dtype():
    """A DataFrame of object-dtype strings accounts for its Python string payload."""
    pd = pytest.importorskip('pandas')

    # 1000 one-character strings: total should fall between one and two
    # times the per-string cost (loose bounds, not exact accounting).
    df = pd.DataFrame({'x': ['a'] * 1000})
    assert sizeof('a') * 1000 < sizeof(df) < 2 * sizeof('a') * 1000

    # 1000 strings of 1000 chars each: clearly over a megabyte.
    s = pd.Series(['a' * 1000] * 1000)
    assert sizeof(s) > 1000000
def test_serires_object_dtype():  # NOTE(review): "serires" is a typo for "series"; kept to preserve the test id
    """A Series of object-dtype strings accounts for its Python string payload."""
    pd = pytest.importorskip('pandas')

    # 1000 one-character strings: loose upper/lower bounds on the total.
    s = pd.Series(['a'] * 1000)
    assert sizeof('a') * 1000 < sizeof(s) < 2 * sizeof('a') * 1000

    # 1000 strings of 1000 chars each: clearly over a megabyte.
    s = pd.Series(['a' * 1000] * 1000)
    assert sizeof(s) > 1000000
def f(c, a, b):
    """Scatter data through an Executor and check placement, byte counts, gather."""
    e = Executor((c.ip, c.port), start=False, loop=loop)
    yield e._start()

    # Scatter a dict: value lands on one of the two workers.
    d = yield e._scatter({'y': 20})
    assert isinstance(d['y'], Future)
    assert a.data.get('y') == 20 or b.data.get('y') == 20
    assert (a.address in e.scheduler.who_has['y'] or
            b.address in e.scheduler.who_has['y'])
    assert c.who_has['y']
    assert e.scheduler.nbytes == {'y': sizeof(20)}
    yy = yield e._gather([d['y']])
    assert yy == [20]

    # Scatter a list: same checks for the auto-generated key.
    [x] = yield e._scatter([10])
    assert isinstance(x, Future)
    assert a.data.get(x.key) == 10 or b.data.get(x.key) == 10
    xx = yield e._gather([x])
    assert c.who_has[x.key]
    assert (a.address in e.scheduler.who_has[x.key] or
            b.address in e.scheduler.who_has[x.key])
    assert e.scheduler.nbytes == {'y': sizeof(20), x.key: sizeof(10)}
    assert xx == [10]

    z = e.submit(add, x, d['y'])  # submit works on RemoteData
    result = yield z._result()
    assert result == 10 + 20
    result = yield e._gather([z, x])
    assert result == [30, 10]

    yield e._shutdown()
def register_geodataframe(df):
    """Estimate the in-memory size of a GeoDataFrame.

    Non-geometry columns are measured via ``sizeof``; each geometry value
    is charged a flat 100-byte estimate.
    TODO: sample wkb
    """
    geom_col = df._geometry_column_name
    if geom_col not in df.columns:
        # No geometry column present: measure as a plain DataFrame.
        return sizeof(pd.DataFrame(df))
    return sizeof(df.drop(geom_col, axis=1)) + len(df) * 100
def test__scatter(s, a, b):
    """Scatter via Executor against scheduler fixture `s`; verify placement and nbytes."""
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    # Scatter a dict: value lands on one of the two workers.
    d = yield e._scatter({'y': 20})
    assert isinstance(d['y'], Future)
    assert a.data.get('y') == 20 or b.data.get('y') == 20
    assert (a.address in s.who_has['y'] or
            b.address in s.who_has['y'])
    assert s.who_has['y']
    assert s.nbytes == {'y': sizeof(20)}
    yy = yield e._gather([d['y']])
    assert yy == [20]

    # Scatter a list: same checks for the auto-generated key.
    [x] = yield e._scatter([10])
    assert isinstance(x, Future)
    assert a.data.get(x.key) == 10 or b.data.get(x.key) == 10
    xx = yield e._gather([x])
    assert s.who_has[x.key]
    assert (a.address in s.who_has[x.key] or
            b.address in s.who_has[x.key])
    assert s.nbytes == {'y': sizeof(20), x.key: sizeof(10)}
    assert xx == [10]

    z = e.submit(add, x, d['y'])  # submit works on RemoteData
    result = yield z._result()
    assert result == 10 + 20
    result = yield e._gather([z, x])
    assert result == [30, 10]

    yield e._shutdown()
def test_empty():
    """Even an empty DataFrame (and its columns/index) reports a positive size."""
    pd = pytest.importorskip('pandas')
    df = pd.DataFrame({'x': [1, 2, 3],
                       'y': ['a' * 100, 'b' * 100, 'c' * 100]},
                      index=[10, 20, 30])
    empty = df.head(0)
    for obj in (empty, empty.x, empty.y, empty.index):
        assert sizeof(obj) > 0
def test_nbytes(s, a, b):
    """Scheduler tracks per-key byte counts for scattered and computed results."""
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    [x] = yield e._scatter([1])
    assert s.nbytes == {x.key: sizeof(1)}

    y = e.submit(inc, x)
    yield y._result()
    assert s.nbytes == {x.key: sizeof(1), y.key: sizeof(2)}

    yield e._shutdown()
def f(c, a, b):
    """Scheduler (via executor) tracks byte counts for scattered and computed keys."""
    e = Executor((c.ip, c.port), start=False, loop=loop)
    yield e._start()

    [x] = yield e._scatter([1])
    assert e.scheduler.nbytes == {x.key: sizeof(1)}

    y = e.submit(inc, x)
    yield y._result()
    assert e.scheduler.nbytes == {x.key: sizeof(1), y.key: sizeof(2)}

    yield e._shutdown()
def test_nbytes(s, a, b):
    """HTTP worker reports per-key sizes (nbytes.json) and per-prefix totals
    (nbytes-summary.json) for the data it holds."""
    server = HTTPWorker(a)
    server.listen(0)
    client = AsyncHTTPClient()

    a.data['x-1'] = 1
    # BUG FIX: this previously stored 1 under 'x-2', contradicting the
    # assertion below that expects sizeof(2) for that key.
    a.data['x-2'] = 2

    nbytes = yield client.fetch('http://localhost:%d/nbytes.json' % server.port)
    nbytes = json.loads(nbytes.body.decode())
    summary = yield client.fetch('http://localhost:%d/nbytes-summary.json'
                                 % server.port)
    summary = json.loads(summary.body.decode())

    assert nbytes == {'x-1': sizeof(1), 'x-2': sizeof(2)}
    # sizeof(1) == sizeof(2) for small ints, so the prefix total is 2x either.
    assert summary == {'x': sizeof(1) * 2}
def f(c, a, b):
    """update_data stores values on the worker and registers them with the center;
    delete_data removes them again."""
    aa = rpc(ip=a.ip, port=a.port)

    response, content = yield aa.update_data(data={'x': 1, 'y': 2})
    assert response == b'OK'
    assert content['nbytes'] == {'x': sizeof(1), 'y': sizeof(2)}
    assert a.data == {'x': 1, 'y': 2}
    assert c.who_has == {'x': {(a.ip, a.port)}, 'y': {(a.ip, a.port)}}
    assert c.has_what[(a.ip, a.port)] == {'x', 'y'}

    yield aa.delete_data(keys=['x'], close=True)
    assert not c.who_has['x']
    assert all('x' not in s for s in c.has_what.values())

    aa.close_streams()
def f(c, a, b):
    """update_data registers worker data with the center; delete_data unregisters."""
    aa = rpc(ip=a.ip, port=a.port)

    response, content = yield aa.update_data(data={"x": 1, "y": 2})
    assert response == b"OK"
    assert content["nbytes"] == {"x": sizeof(1), "y": sizeof(2)}
    assert a.data == {"x": 1, "y": 2}
    assert c.who_has == {"x": {(a.ip, a.port)}, "y": {(a.ip, a.port)}}
    assert c.has_what[(a.ip, a.port)] == {"x", "y"}

    yield aa.delete_data(keys=["x"], close=True)
    assert not c.who_has["x"]
    assert all("x" not in s for s in c.has_what.values())

    aa.close_streams()
def maybe_to_futures(args):
    """Yield each arg, substituting an itemgetter for args already (or newly)
    backed by a distributed future.

    Relies on enclosing-scope state: `itemgetters` (cache keyed by id),
    `self.data_futures`, `call_data_futures`, and `collected_futures`.
    """
    for arg in args:
        arg_id = id(arg)
        # Already converted earlier in this call: reuse the cached getter.
        if arg_id in itemgetters:
            yield itemgetters[arg_id]
            continue

        f = self.data_futures.get(arg_id, None)
        if f is None and call_data_futures is not None:
            try:
                f = call_data_futures[arg]
            except KeyError:
                if is_weakrefable(arg) and sizeof(arg) > 1e3:
                    # Automatically scatter large objects to some of
                    # the workers to avoid duplicated data transfers.
                    # Rely on automated inter-worker data stealing if
                    # more workers need to reuse this data
                    # concurrently.
                    [f] = self.client.scatter([arg])
                    call_data_futures[arg] = f

        if f is not None:
            # Replace the arg with a getter into the shared futures list.
            getter = itemgetter(len(collected_futures))
            collected_futures.append(f)
            itemgetters[arg_id] = getter
            arg = getter
        yield arg
def dont_test_workers_update_center(s, a, b):
    """(disabled test) Worker update_data/delete_data keep scheduler state in sync."""
    aa = rpc(ip=a.ip, port=a.port)

    response = yield aa.update_data(data={'x': dumps(1), 'y': dumps(2)})
    assert response['status'] == 'OK'
    assert response['nbytes'] == {'x': sizeof(1), 'y': sizeof(2)}
    assert a.data == {'x': 1, 'y': 2}
    assert s.who_has == {'x': {a.address}, 'y': {a.address}}
    assert s.has_what[a.address] == {'x', 'y'}

    yield aa.delete_data(keys=['x'], close=True)
    assert not s.who_has['x']
    # BUG FIX: previously iterated `c.has_what.values()`, but `c` is not
    # defined in this test — the scheduler fixture here is `s`.
    assert all('x' not in keys for keys in s.has_what.values())

    aa.close_rpc()
def device_host_file_size_matches(dhf, total_bytes, device_chunk_overhead=0,
                                  serialized_chunk_overhead=1024):
    """Check that the bytes tracked across device, host, and disk tiers of a
    DeviceHostFile add up to ``total_bytes``, allowing a per-chunk overhead
    on each tier."""
    total = dhf.device_buffer.fast.total_weight

    # `dhf.host_buffer.fast` is only available when Worker's `memory_limit != 0`
    if hasattr(dhf.host_buffer, "fast"):
        total += dhf.host_buffer.fast.total_weight
    else:
        total += sum(sizeof(value) for value in dhf.host_buffer.values())

    # `dhf.disk` is only available when Worker's `memory_limit != 0`
    if dhf.disk is not None:
        total += sum(
            os.path.getsize(os.path.join(dhf.disk.directory, safe_key(key)))
            for key in dhf.disk.keys()
        )

    # Allow up to chunk_overhead bytes overhead per chunk on each tier.
    allowance = len(dhf.device) * device_chunk_overhead
    allowance += len(dhf.host) * serialized_chunk_overhead
    if dhf.disk is not None:
        allowance += len(dhf.disk) * serialized_chunk_overhead

    return total_bytes <= total <= total_bytes + allowance
def f(c, a, b):
    """Drive two workers over rpc: successful computes, a failing compute,
    then worker shutdown bookkeeping."""
    aa = rpc(ip=a.ip, port=a.port)
    bb = rpc(ip=b.ip, port=b.port)

    result = yield aa.identity()
    assert not a.active

    # Compute x = add(1, 2) on worker a.
    response = yield aa.compute(key='x', function=dumps(add),
                                args=dumps([1, 2]), who_has={}, close=True)
    assert not a.active
    assert response['status'] == 'OK'
    assert a.data['x'] == 3
    assert c.who_has['x'] == {a.address}
    assert isinstance(response['compute-start'], float)
    assert isinstance(response['compute-stop'], float)
    assert isinstance(response['thread'], int)

    # Compute y = add(x, 10) on worker b, transferring x from a.
    response = yield bb.compute(key='y', function=dumps(add),
                                args=dumps(['x', 10]),
                                who_has={'x': [a.address]})
    assert response['status'] == 'OK'
    assert b.data['y'] == 13
    assert c.who_has['y'] == {b.address}
    assert response['nbytes'] == sizeof(b.data['y'])
    assert isinstance(response['transfer-start'], float)
    assert isinstance(response['transfer-stop'], float)

    def bad_func():
        1 / 0

    # A failing task reports status 'error' with a serialized exception.
    response = yield bb.compute(key='z', function=dumps(bad_func),
                                args=dumps(()), close=True)
    assert not b.active
    assert response['status'] == 'error'
    assert isinstance(loads(response['exception']), ZeroDivisionError)
    if sys.version_info[0] >= 3:
        assert any('1 / 0' in line
                   for line in pluck(3, traceback.extract_tb(
                       loads(response['traceback'])))
                   if line)

    aa.close_streams()
    yield a._close()
    assert a.address not in c.ncores and b.address in c.ncores
    assert list(c.ncores.keys()) == [b.address]
    assert isinstance(b.address, str)
    assert b.ip in b.address
    assert str(b.port) in b.address

    bb.close_streams()
    yield b._close()
def dont_test_workers_update_center(s, a, b):
    """(disabled test) Worker update_data/delete_data keep scheduler state in sync."""
    aa = rpc(ip=a.ip, port=a.port)

    response = yield aa.update_data(data={'x': dumps(1), 'y': dumps(2)})
    assert response['status'] == 'OK'
    assert response['nbytes'] == {'x': sizeof(1), 'y': sizeof(2)}
    assert a.data == {'x': 1, 'y': 2}
    assert s.who_has == {'x': {a.address}, 'y': {a.address}}
    assert s.has_what[a.address] == {'x', 'y'}

    yield aa.delete_data(keys=['x'], close=True)
    assert not s.who_has['x']
    # BUG FIX: previously iterated `c.has_what.values()`, but `c` is not
    # defined in this test — the scheduler fixture here is `s`.
    assert all('x' not in keys for keys in s.has_what.values())

    aa.close_streams()
def test_pandas():
    """Sanity bounds on sizeof() for a small mixed-dtype DataFrame."""
    pd = pytest.importorskip('pandas')
    df = pd.DataFrame({'x': [1, 2, 3],
                       'y': ['a' * 1000, 'b' * 1000, 'c' * 1000]},
                      index=[10, 20, 30])

    # Whole frame is at least the columns minus one shared index.
    assert sizeof(df) >= sizeof(df.x) + sizeof(df.y) - sizeof(df.index)
    assert sizeof(df.x) >= sizeof(df.index)
    # NOTE(review): lexicographic version comparison — works for the
    # versions in play but is not a general version check.
    if pd.__version__ >= '0.17.1':
        assert sizeof(df.y) >= 1000 * 3
    assert sizeof(df.index) >= 20
def f(c, a, b):
    """Drive two workers over rpc: successful computes, a failing compute,
    then worker shutdown bookkeeping."""
    aa = rpc(ip=a.ip, port=a.port)
    bb = rpc(ip=b.ip, port=b.port)

    result = yield aa.identity()
    assert not a.active

    # Compute x = add(1, 2) on worker a.
    response = yield aa.compute(key='x', function=dumps(add),
                                args=dumps([1, 2]), who_has={}, close=True)
    assert not a.active
    assert response['status'] == 'OK'
    assert a.data['x'] == 3
    assert c.who_has['x'] == {a.address}
    assert isinstance(response['compute-start'], float)
    assert isinstance(response['compute-stop'], float)
    assert isinstance(response['thread'], int)

    # Compute y = add(x, 10) on worker b, transferring x from a.
    response = yield bb.compute(key='y', function=dumps(add),
                                args=dumps(['x', 10]),
                                who_has={'x': [a.address]})
    assert response['status'] == 'OK'
    assert b.data['y'] == 13
    assert c.who_has['y'] == {b.address}
    assert response['nbytes'] == sizeof(b.data['y'])
    assert isinstance(response['transfer-start'], float)
    assert isinstance(response['transfer-stop'], float)

    def bad_func():
        1 / 0

    # A failing task reports status 'error' with a serialized exception.
    response = yield bb.compute(key='z', function=dumps(bad_func),
                                args=dumps(()), close=True)
    assert not b.active
    assert response['status'] == 'error'
    assert isinstance(loads(response['exception']), ZeroDivisionError)
    if sys.version_info[0] >= 3:
        assert any('1 / 0' in line
                   for line in pluck(
                       3, traceback.extract_tb(loads(response['traceback'])))
                   if line)

    aa.close_streams()
    yield a._close()
    assert a.address not in c.ncores and b.address in c.ncores
    assert list(c.ncores.keys()) == [b.address]
    assert isinstance(b.address, str)
    assert b.ip in b.address
    assert str(b.port) in b.address

    bb.close_streams()
    yield b._close()
def test_sparse_matrix():
    """Each scipy sparse format of a 10x10 identity meets a minimum sizeof()."""
    sparse = pytest.importorskip('scipy.sparse')
    sp = sparse.eye(10)
    minimums = {'todia': 152, 'tobsr': 232, 'tocoo': 252, 'tocsc': 232,
                'tocsr': 260, 'todok': 260, 'tolil': 324}
    for convert, minimum in minimums.items():
        assert sizeof(getattr(sp, convert)()) >= minimum
def test_pandas():
    """Sanity bounds on sizeof() for a small mixed-dtype DataFrame."""
    pd = pytest.importorskip('pandas')
    df = pd.DataFrame({'x': [1, 2, 3],
                       'y': ['a' * 1000, 'b' * 1000, 'c' * 1000]},
                      index=[10, 20, 30])

    # Whole frame is at least the columns minus one shared index.
    assert sizeof(df) >= sizeof(df.x) + sizeof(df.y) - sizeof(df.index)
    assert sizeof(df.x) >= sizeof(df.index)
    # NOTE(review): lexicographic version comparison — adequate here.
    if pd.__version__ >= '0.17.1':
        assert sizeof(df.y) >= 1000 * 3
    assert sizeof(df.index) >= 20
def f(c, a, b):
    """Compute on two workers over rpc (older needed=[] protocol), then verify
    error handling and shutdown bookkeeping."""
    aa = rpc(ip=a.ip, port=a.port)
    bb = rpc(ip=b.ip, port=b.port)

    assert not a.active
    # Compute x = add(1, 2) on worker a.
    response, _ = yield aa.compute(key='x', function=add, args=[1, 2],
                                   needed=[], close=True)
    assert not a.active
    assert response == b'OK'
    assert a.data['x'] == 3
    assert c.who_has['x'] == set([(a.ip, a.port)])

    # Compute y = add(x, 10) on worker b, which must fetch x.
    response, info = yield bb.compute(key='y', function=add, args=['x', 10],
                                      needed=['x'])
    assert response == b'OK'
    assert b.data['y'] == 13
    assert c.who_has['y'] == set([(b.ip, b.port)])
    assert info['nbytes'] == sizeof(b.data['y'])

    def bad_func():
        1 / 0

    # A failing task reports b'error' with the exception and traceback.
    response, content = yield bb.compute(key='z', function=bad_func,
                                         args=(), needed=(), close=True)
    assert not b.active
    assert response == b'error'
    assert isinstance(content['exception'], ZeroDivisionError)
    if sys.version_info[0] >= 3:
        assert any('1 / 0' in line
                   for line in pluck(
                       3, traceback.extract_tb(content['traceback']))
                   if line)

    aa.close_streams()
    yield a._close()
    assert a.address not in c.ncores and b.address in c.ncores
    assert list(c.ncores.keys()) == [(b.ip, b.port)]
    assert isinstance(b.address_string, str)
    assert b.ip in b.address_string
    assert str(b.port) in b.address_string

    bb.close_streams()
    yield b._close()
def test_sparse_matrix():
    """Each scipy sparse format of a 10x10 identity meets a minimum sizeof()."""
    sparse = pytest.importorskip('scipy.sparse')
    sp = sparse.eye(10)
    # These are the 32-bit Python 2.7 values.
    minimums = {'todia': 152, 'tobsr': 232, 'tocoo': 240, 'tocsc': 232,
                'tocsr': 232, 'todok': 192, 'tolil': 204}
    for convert, minimum in minimums.items():
        assert sizeof(getattr(sp, convert)()) >= minimum
def f(c, a, b):
    """Compute on two workers over rpc using the who_has protocol, then verify
    error handling and shutdown bookkeeping."""
    aa = rpc(ip=a.ip, port=a.port)
    bb = rpc(ip=b.ip, port=b.port)

    assert not a.active
    # Compute x = add(1, 2) on worker a.
    response, _ = yield aa.compute(key='x', function=add, args=[1, 2],
                                   who_has={}, close=True)
    assert not a.active
    assert response == b'OK'
    assert a.data['x'] == 3
    assert c.who_has['x'] == set([(a.ip, a.port)])

    # Compute y = add(x, 10) on worker b, fetching x from a.
    response, info = yield bb.compute(key='y', function=add, args=['x', 10],
                                      who_has={'x': {a.address}})
    assert response == b'OK'
    assert b.data['y'] == 13
    assert c.who_has['y'] == set([(b.ip, b.port)])
    assert info['nbytes'] == sizeof(b.data['y'])

    def bad_func():
        1 / 0

    # A failing task reports b'error' with the exception and traceback.
    response, content = yield bb.compute(key='z', function=bad_func,
                                         args=(), close=True)
    assert not b.active
    assert response == b'error'
    assert isinstance(content['exception'], ZeroDivisionError)
    if sys.version_info[0] >= 3:
        assert any('1 / 0' in line
                   for line in pluck(3, traceback.extract_tb(
                       content['traceback']))
                   if line)

    aa.close_streams()
    yield a._close()
    assert a.address not in c.ncores and b.address in c.ncores
    assert list(c.ncores.keys()) == [(b.ip, b.port)]
    assert isinstance(b.address_string, str)
    assert b.ip in b.address_string
    assert str(b.port) in b.address_string

    bb.close_streams()
    yield b._close()
def f(c, a, b):
    """Compute on two workers over rpc (error tuple protocol), then verify
    shutdown bookkeeping."""
    aa = rpc(ip=a.ip, port=a.port)
    bb = rpc(ip=b.ip, port=b.port)

    # Compute x = add(1, 2) on worker a.
    response, _ = yield aa.compute(key='x', function=add, args=[1, 2],
                                   needed=[], close=True)
    assert response == b'OK'
    assert a.data['x'] == 3
    assert c.who_has['x'] == set([(a.ip, a.port)])

    # Compute y = add(x, 10) on worker b, which must fetch x.
    response, info = yield bb.compute(key='y', function=add, args=['x', 10],
                                      needed=['x'])
    assert response == b'OK'
    assert b.data['y'] == 13
    assert c.who_has['y'] == set([(b.ip, b.port)])
    assert info['nbytes'] == sizeof(b.data['y'])

    def bad_func():
        1 / 0

    # Failure unpacks to (error, traceback) — note `traceback` here is the
    # unpacked list of lines, deliberately shadowing the module name locally.
    response, (error, traceback) = yield bb.compute(key='z',
                                                    function=bad_func,
                                                    args=(), needed=(),
                                                    close=True)
    assert response == b'error'
    assert isinstance(error, ZeroDivisionError)
    if sys.version_info[0] >= 3:
        assert any('1 / 0' in line for line in traceback)

    aa.close_streams()
    yield a._close()
    assert a.address not in c.ncores and b.address in c.ncores
    assert list(c.ncores.keys()) == [(b.ip, b.port)]

    bb.close_streams()
    yield b._close()
def f(c, a, b):
    """Compute on two workers over rpc (error tuple protocol, double-quoted
    variant), then verify shutdown bookkeeping."""
    aa = rpc(ip=a.ip, port=a.port)
    bb = rpc(ip=b.ip, port=b.port)

    assert not a.active
    # Compute x = add(1, 2) on worker a.
    response, _ = yield aa.compute(key="x", function=add, args=[1, 2],
                                   needed=[], close=True)
    assert not a.active
    assert response == b"OK"
    assert a.data["x"] == 3
    assert c.who_has["x"] == set([(a.ip, a.port)])

    # Compute y = add(x, 10) on worker b, which must fetch x.
    response, info = yield bb.compute(key="y", function=add, args=["x", 10],
                                      needed=["x"])
    assert response == b"OK"
    assert b.data["y"] == 13
    assert c.who_has["y"] == set([(b.ip, b.port)])
    assert info["nbytes"] == sizeof(b.data["y"])

    def bad_func():
        1 / 0

    # Failure unpacks to (error, traceback) — `traceback` is the unpacked
    # list of lines, deliberately shadowing the module name locally.
    response, (error, traceback) = yield bb.compute(key="z",
                                                    function=bad_func,
                                                    args=(), needed=(),
                                                    close=True)
    assert not b.active
    assert response == b"error"
    assert isinstance(error, ZeroDivisionError)
    if sys.version_info[0] >= 3:
        assert any("1 / 0" in line for line in traceback)

    aa.close_streams()
    yield a._close()
    assert a.address not in c.ncores and b.address in c.ncores
    assert list(c.ncores.keys()) == [(b.ip, b.port)]

    bb.close_streams()
    yield b._close()
def test_pandas():
    """Sanity bounds on sizeof() for a DataFrame, and integer return type."""
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame({"x": [1, 2, 3],
                       "y": ["a" * 1000, "b" * 1000, "c" * 1000]},
                      index=[10, 20, 30])

    # Whole frame is at least the columns minus one shared index.
    assert sizeof(df) >= sizeof(df.x) + sizeof(df.y) - sizeof(df.index)
    assert sizeof(df.x) >= sizeof(df.index)
    if pd.__version__ >= "0.17.1":
        assert sizeof(df.y) >= 1000 * 3
    assert sizeof(df.index) >= 20

    # sizeof must return a plain int for every pandas object.
    for obj in (df, df.x, df.index):
        assert isinstance(sizeof(obj), int)
def test_containers():
    """Nested containers are measured recursively, beyond shallow getsizeof."""
    nested = [1, 2, [3]]
    shallow_floor = getsizeof(3) * 3 + getsizeof([])
    assert sizeof(nested) > shallow_floor
def test_numpy():
    """A numpy array's size covers at least its raw data buffer."""
    np = pytest.importorskip("numpy")
    arr = np.empty(1000, dtype="f8")  # 1000 * 8 bytes of float64
    assert sizeof(arr) >= 8000
def test_numpy():
    """A numpy array's size covers at least its raw data buffer."""
    np = pytest.importorskip('numpy')
    arr = np.empty(1000, dtype='f8')  # 1000 * 8 bytes of float64
    assert sizeof(arr) >= 8000
def test_numpy():
    """Array size equals its data buffer exactly; dtype objects fall back to
    the interpreter's own accounting."""
    np = pytest.importorskip('numpy')
    arr = np.empty(1000, dtype='f8')  # 1000 * 8 bytes of float64
    assert sizeof(arr) == 8000
    dt = np.dtype('f8')
    assert sizeof(dt) == sys.getsizeof(dt)
def test_base():
    """For a plain int, sizeof falls back to the interpreter's getsizeof."""
    expected = getsizeof(1)
    assert sizeof(1) == expected
def test_sizeof():
    """The full grid frame outweighs the subset of just its coordinate columns."""
    subset = grid_df[['x', 'y']]
    assert sizeof(grid_df) > sizeof(subset)
def test_pandas_repeated_column():
    """Selecting the same column three times is measured as more data, not less."""
    pd = pytest.importorskip('pandas')
    df = pd.DataFrame({'x': [1, 2, 3]})
    tripled = df[['x', 'x', 'x']]
    assert sizeof(tripled) > sizeof(df)