def test_unique(df_factory):
    """Unique values (and inverse indices) are correct even when the
    evaluation buffer is tiny, forcing multiple passes."""
    df = df_factory(colors=['red', 'green', 'blue', 'green'])
    with small_buffer(df, 2):
        assert set(df.unique(df.colors)) == {'red', 'green', 'blue'}
        uniques, inverse = df.unique(df.colors, return_inverse=True)
        # reconstructing via the inverse index must give back the column
        assert np.array(uniques)[inverse].tolist() == df.colors.tolist()

    df = df_factory(x=['a', 'b', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'a'])
    with small_buffer(df, 2):
        assert set(df.unique(df.x)) == {'a', 'b'}
        uniques, inverse = df.unique(df.x, return_inverse=True)
        assert np.array(uniques)[inverse].tolist() == df.x.tolist()
def test_unique():
    """Same unique/inverse round-trip as the factory variant, but built
    directly with vaex.from_arrays; unique() here yields an arrow array,
    hence the to_pylist() conversion before the set comparison."""
    df = vaex.from_arrays(colors=['red', 'green', 'blue', 'green'])
    with small_buffer(df, 2):
        assert set(df.unique(df.colors).to_pylist()) == {'red', 'green', 'blue'}
        uniques, inverse = df.unique(df.colors, return_inverse=True)
        assert np.array(uniques)[inverse].tolist() == df.colors.tolist()

    df = vaex.from_arrays(x=['a', 'b', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'a'])
    with small_buffer(df, 2):
        assert set(df.unique(df.x).to_pylist()) == {'a', 'b'}
        uniques, inverse = df.unique(df.x, return_inverse=True)
        assert np.array(uniques)[inverse].tolist() == df.x.tolist()
def test_unique_arrow(df_factory):
    """unique() must also work on a vaex string column (arrow-backed)."""
    data = ['a', 'b', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'a']
    df = df_factory(x=vaex.string_column(data))
    with small_buffer(df, 2):
        assert set(df.unique(df.x)) == {'a', 'b'}
        uniques, inverse = df.unique(df.x, return_inverse=True)
        # inverse indices must reconstruct the original column
        assert np.array(uniques)[inverse].tolist() == df.x.tolist()
def test_cache(buffer_size):
    """Exercise DatasetCached: first directly through the dataset chunk
    iterator, then as an integration test through DataFrame aggregation,
    checking how many columns land in the cache dict."""
    x = np.arange(10)
    y = x**2
    z = 2*x
    df = vaex.from_arrays(x=x, y=y, z=z)

    # first test the dataset interface
    cache = {}
    ds = vaex.dataset.DatasetCached(df.dataset, ['x', 'y'], cache)
    # renamed from `iter`, which shadowed the builtin
    chunk_iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(5):
        i1, i2, chunks = next(chunk_iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2
        # bug fix: these two comparisons were bare expressions (no
        # `assert`), so the chunk contents were never actually checked
        assert chunks['x'].tolist() == x[i1:i2].tolist()
        assert chunks['y'].tolist() == y[i1:i2].tolist()

    # more of an integration test
    df['q'] = df.x + df.y
    with small_buffer(df, 3):
        cache = {}
        df.dataset = vaex.dataset.DatasetCached(df.dataset, ['x', 'y'], cache)
        df.z.sum()
        assert len(cache) == 0  # z is not a cached column
        df.x.sum()  # it will also fill up due to dtype evaluation
        assert len(cache) == 2
        df.q.sum()  # q = x + y, pulls in the cached y as well
        assert len(cache) == 4
        df.y.sum()  # already cached, no growth
        assert len(cache) == 4
def test_unique_nan(df_factory):
    """unique() with NaN present: NaN compares unequal to itself, so the
    NaN slots are masked out before comparing the reconstructed column."""
    data = [np.nan, 0, 1, np.nan, 2, np.nan]
    df = df_factory(x=data)
    # skip element [0] of the sorted result: NaN ordering is not comparable
    assert list(sorted(df.x.unique()))[1:] == [np.nan, 0, 1, 2][1:]
    with small_buffer(df, 2):
        values, indices = df.unique(df.x, return_inverse=True)
        values = values[indices]
        mask = np.isnan(values)
        # only the non-NaN positions can be compared for equality
        assert values[~mask].tolist() == df.x.to_numpy()[~mask].tolist()
def test_unique_nan():
    """NaN-aware unique test on a plain numpy-backed DataFrame
    (uses .values rather than .to_numpy())."""
    data = [np.nan, 0, 1, np.nan, 2, np.nan]
    df = vaex.from_arrays(x=data)
    # the NaN entry sorts unpredictably, so compare everything past it
    assert list(sorted(df.x.unique()))[1:] == [np.nan, 0, 1, 2][1:]
    with small_buffer(df, 2):
        values, indices = df.unique(df.x, return_inverse=True)
        values = values[indices]
        mask = np.isnan(values)
        # NaN != NaN, so restrict the equality check to non-NaN slots
        assert values[~mask].tolist() == df.x.values[~mask].tolist()
def test_unique_nan(df_factory):
    """NaN-aware unique test using the dropnan= keyword and the dropnan()
    test helper; expects exactly one NaN among the unique values."""
    data = [np.nan, 0, 1, np.nan, 2, np.nan]
    df = df_factory(x=data)
    assert set(df.x.unique(dropnan=True)) == {0, 1, 2}
    # without dropnan=True, exactly one NaN should be present in the result
    assert dropnan(set(df.x.unique()), expect=1) == {0, 1, 2}
    with small_buffer(df, 2):
        values, indices = df.unique(df.x, return_inverse=True)
        reconstructed = np.array(values)[indices]
        mask = np.isnan(reconstructed)
        # NaN != NaN, so compare only the non-NaN positions
        assert reconstructed[~mask].tolist() == df.x.to_numpy()[~mask].tolist()
def test_with_masked_no_short_circuit():
    """Join must evaluate the full table even when a left value is missing
    on the right.

    In some rare condition the left table has a value not present in the
    right, which caused the other lookups not to be evaluated due to
    Python's short-circuit behaviour (e.g. ``True or func()`` never calls
    ``func``).  The missing key (9) must come back as a masked/None value.
    """
    # note: removed dead local `N = 1000` — it was never used below
    df = vaex.from_arrays(i=np.arange(100) % 10)
    df_right = vaex.from_arrays(i=np.arange(9), j=np.arange(9))
    with small_buffer(df, size=1):
        dfj = df.join(other=df_right, on='i')
        # j must be a masked column: key 9 has no match on the right
        assert dfj.dataset.right._columns['j'].masked
        assert dfj[:10].dataset.right._columns['j'].masked
        assert dfj['j'][:10].tolist() == [0, 1, 2, 3, 4, 5, 6, 7, 8, None]
        dfj['j'].tolist()  # make sure we can evaluate the whole column
def test_thread_safe(df_local):
    """Hammer the executor from many threads; every concurrent count()
    must return the same value, and the executor should coalesce work
    into at most N extra passes."""
    df = df_local

    # an executor should be thread safe
    def do():
        return df_local.count(df.x)  # enters the executor from a thread

    expected = df_local.count(df.x)
    pool = ThreadPoolExecutor(4)
    passes_before = df.executor.passes
    N = 100
    with small_buffer(df):
        futures = [pool.submit(do) for _ in range(N)]
        concurrent.futures.wait(futures)
    for future in futures:
        assert expected == future.result()
    assert df.executor.passes <= passes_before + N
async def test_async_safe(df_local):
    """Fire many delayed count() tasks concurrently; the executor should
    coalesce them so that far fewer than N passes over the data occur.

    Fixes: ``import asyncio`` / ``import random`` were placed after (or
    inside) their use sites and only worked through closure late binding;
    both are hoisted to the top.  The unused ``passes`` local is dropped.
    """
    import asyncio
    import random

    df = df_local
    with vaex.cache.off():
        async def do():
            promise = df.x.count(delay=True)
            # random small sleep so tasks interleave in arbitrary order
            r = random.random() * 0.01
            await asyncio.sleep(r)
            await df.execute_async()
            return await promise

        df.executor.passes = 0
        N = 1000
        with small_buffer(df):
            awaitables = [do() for _ in range(N)]
            await asyncio.gather(*awaitables)
        # tasks must have shared passes instead of doing one pass each
        assert df.executor.passes < N
def test_thread_safe(df_local):
    """Thread-safety test with the vaex cache disabled; waits with
    FIRST_EXCEPTION so a failing worker surfaces immediately, and only
    checks the futures that completed."""
    with vaex.cache.off():
        df = df_local

        # an executor should be thread safe
        def do():
            return df_local.count(df.x)  # enters the executor from a thread

        expected = df_local.count(df.x)
        pool = ThreadPoolExecutor(4)
        passes_before = df.executor.passes
        N = 100
        with small_buffer(df):
            futures = [pool.submit(do) for _ in range(N)]
            done, not_done = concurrent.futures.wait(
                futures, return_when=concurrent.futures.FIRST_EXCEPTION)
        for future in done:
            assert expected == future.result()
        assert df.executor.passes <= passes_before + N