Example #1
def test_unique(df_factory):
    ds = df_factory(colors=['red', 'green', 'blue', 'green'])
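    # small_buffer shrinks the evaluation buffer (to 2 elements here), so unique() has to merge results across many small chunks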
    with small_buffer(ds, 2):
        assert set(ds.unique(ds.colors)) == {'red', 'green', 'blue'}
        values, index = ds.unique(ds.colors, return_inverse=True)
        assert np.array(values)[index].tolist() == ds.colors.tolist()

    ds = df_factory(x=['a', 'b', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'a'])
    with small_buffer(ds, 2):
        assert set(ds.unique(ds.x)) == {'a', 'b'}
        values, index = ds.unique(ds.x, return_inverse=True)
        assert np.array(values)[index].tolist() == ds.x.tolist()
Example #2
def test_unique():
    ds = vaex.from_arrays(colors=['red', 'green', 'blue', 'green'])
    with small_buffer(ds, 2):
        assert set(ds.unique(ds.colors).to_pylist()) == {'red', 'green', 'blue'}
        values, index = ds.unique(ds.colors, return_inverse=True)
        assert np.array(values)[index].tolist() == ds.colors.tolist()

    ds = vaex.from_arrays(x=['a', 'b', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'a'])
    with small_buffer(ds, 2):
        assert set(ds.unique(ds.x).to_pylist()) == {'a', 'b'}
        values, index = ds.unique(ds.x, return_inverse=True)
        assert np.array(values)[index].tolist() == ds.x.tolist()
Example #3
def test_unique_arrow(df_factory):
    ds = df_factory(x=vaex.string_column(
        ['a', 'b', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'a']))
    with small_buffer(ds, 2):
        assert set(ds.unique(ds.x)) == {'a', 'b'}
        values, index = ds.unique(ds.x, return_inverse=True)
        assert np.array(values)[index].tolist() == ds.x.tolist()
Example #4
def test_cache(buffer_size):
    x = np.arange(10)
    y = x**2
    z = 2*x
    df = vaex.from_arrays(x=x, y=y, z=z)

    # first test the dataset interface
    cache = {}
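    # DatasetCached wraps the dataset so that data read for the listed columns ('x' and 'y') ends up in the cache dict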
    ds = vaex.dataset.DatasetCached(df.dataset, ['x', 'y'], cache)
    it = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(5):
        i1, i2, chunks = next(it)
        assert i1 == i*2
        assert i2 == (i + 1) * 2
        assert chunks['x'].tolist() == x[i1:i2].tolist()
        assert chunks['y'].tolist() == y[i1:i2].tolist()

    # more of an integration test
    df['q'] = df.x + df.y
    with small_buffer(df, 3):
        cache = {}
        df.dataset = vaex.dataset.DatasetCached(df.dataset, ['x', 'y'], cache)
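        # 'z' is not in the cached column list, so evaluating it should not populate the cache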
        df.z.sum()
        assert len(cache) == 0
        df.x.sum()
        # it will also fill up due to dtype evaluation
        assert len(cache) == 2
        df.q.sum()
        assert len(cache) == 4
        df.y.sum()
        assert len(cache) == 4
Example #5
def test_unique_nan(df_factory):
    x = [np.nan, 0, 1, np.nan, 2, np.nan]
    df = df_factory(x=x)
    assert list(sorted(df.x.unique()))[1:] == [np.nan, 0, 1, 2][1:]
    with small_buffer(df, 2):
        values, indices = df.unique(df.x, return_inverse=True)
        values = values[indices]
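        # NaN never compares equal to itself, so only the non-NaN positions are compared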
        mask = np.isnan(values)
        assert values[~mask].tolist() == df.x.to_numpy()[~mask].tolist()
Example #6
def test_unique_nan():
    x = [np.nan, 0, 1, np.nan, 2, np.nan]
    df = vaex.from_arrays(x=x)
    assert list(sorted(df.x.unique()))[1:] == [np.nan, 0, 1, 2][1:]
    with small_buffer(df, 2):
        values, indices = df.unique(df.x, return_inverse=True)
        values = values[indices]
        mask = np.isnan(values)
        assert values[~mask].tolist() == df.x.values[~mask].tolist()
Example #7
def test_unique_nan(df_factory):
    x = [np.nan, 0, 1, np.nan, 2, np.nan]
    df = df_factory(x=x)
    assert set(df.x.unique(dropnan=True)) == {0, 1, 2}
    assert dropnan(set(df.x.unique()), expect=1) == {0, 1, 2}
    with small_buffer(df, 2):
        values, indices = df.unique(df.x, return_inverse=True)
        values = np.array(values)
        values = values[indices]
        mask = np.isnan(values)
        assert values[~mask].tolist() == df.x.to_numpy()[~mask].tolist()
Example #8
def test_with_masked_no_short_circuit():
    # this tests that the full table is joined; in some rare conditions
    # the left table has a value not present in the right table, which could
    # cause the remaining lookups not to be evaluated, due to Python's
    # short-circuit behaviour, e.g. `True or func()` never calls func()
    N = 1000
    df = vaex.from_arrays(i=np.arange(100) % 10)
    df_right = vaex.from_arrays(i=np.arange(9), j=np.arange(9))
    with small_buffer(df, size=1):
        dfj = df.join(other=df_right, on='i')
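    # i=9 occurs in df but not in df_right, so the joined 'j' column must be masked (None for those rows)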
    assert dfj.dataset.right._columns['j'].masked
    assert dfj[:10].dataset.right._columns['j'].masked
    assert dfj['j'][:10].tolist() == [0, 1, 2, 3, 4, 5, 6, 7, 8, None]
    dfj['j'].tolist()  # make sure we can evaluate the whole column
Example #9
def test_thread_safe(df_local):
    df = df_local

    # an executor should be thread safe
    def do():
        return df_local.count(df.x)  # enters the executor from a thread

    count = df_local.count(df.x)
    tpe = ThreadPoolExecutor(4)
    futures = []

    passes = df.executor.passes
    N = 100
    with small_buffer(df):
        for i in range(N):
            futures.append(tpe.submit(do))

    concurrent.futures.wait(futures)
    for future in futures:
        assert count == future.result()
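    # in the worst case each of the N tasks triggers its own pass over the data; ideally many are shared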
    assert df.executor.passes <= passes + N
Example #10
async def test_async_safe(df_local):
    import asyncio
    import random
    df = df_local
    with vaex.cache.off():

        async def do():
            promise = df.x.count(delay=True)
            r = random.random() * 0.01
            await asyncio.sleep(r)
            await df.execute_async()
            return await promise

        awaitables = []
        passes = df.executor.passes = 0
        N = 1000
        with small_buffer(df):
            for i in range(N):
                awaitables.append(do())
        values = await asyncio.gather(*awaitables)
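        # overlapping tasks should be combined into shared passes, so fewer passes than tasks are needed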
        assert df.executor.passes < N
Example #11
def test_thread_safe(df_local):
    with vaex.cache.off():
        df = df_local

        # an executor should be thread safe
        def do():
            return df_local.count(df.x)  # enters the executor from a thread

        count = df_local.count(df.x)
        tpe = ThreadPoolExecutor(4)
        futures = []

        passes = df.executor.passes
        N = 100
        with small_buffer(df):
            for i in range(N):
                futures.append(tpe.submit(do))

        done, not_done = concurrent.futures.wait(
            futures, return_when=concurrent.futures.FIRST_EXCEPTION)
        for future in done:
            assert count == future.result()
        assert df.executor.passes <= passes + N