def test_count_cat():
    """Counting binned by an ordinal-encoded categorical column respects the value order.

    Fixed: removed dead statements — the first ``from_arrays`` result and two
    ``ordinal_encode`` results were immediately overwritten and never used
    (apparent merge debris; compare the clean ``test_count_cat(lazy)`` variant).
    """
    ds0 = vaex.from_arrays(colors=['red', 'green', 'blue', 'green'],
                           names=['apple', 'apple', 'berry', 'apple'])
    # encode with an explicit value order; bin order follows that list
    ds = ds0.ordinal_encode('colors', ['red', 'green', 'blue'])
    assert ds.count(binby=ds.colors).tolist() == [1, 2, 1]
    # a different value order permutes the per-bin counts accordingly
    ds = ds0.ordinal_encode('colors', ['red', 'blue', 'green'], inplace=True)
    assert ds.count(binby=ds.colors).tolist() == [1, 1, 2]
def test_cat_missing_values():
    """Masked entries land in the extra edge bins when counting with edges=True."""
    raw = ['red', 'green', 'blue', 'green', 'MISSING']
    colors = np.ma.array(raw, mask=[False, False, False, False, True])
    df = vaex.from_arrays(colors=colors)
    df = df.ordinal_encode('colors', ['red', 'green', 'blue'])
    # edges=True adds edge bins around the 3 category bins
    assert df.count(binby=df.colors, edges=True).tolist() == [1, 0, 1, 2, 1, 0]
def test_expr():
    """Binning on an expression ('x*2') over a non-contiguous array works."""
    backing = np.zeros((10, 2)).reshape(20)
    strided = backing[::2]  # every other element: a non-contiguous view
    strided[:] = np.arange(10)
    df = vaex.from_arrays(x=strided)
    counts = df.count('x*2', binby='x*2', limits=[-0.5, 19.5], shape=10)
    assert counts.tolist() == [1.0] * 10
def test_groupby_count_string():
    """Counting a string column grouped by itself."""
    g = np.array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2])
    s = np.array([str(v) for v in g])
    df = vaex.from_arrays(g=g, s=s)
    dfg = df.groupby('s').agg({'c': vaex.agg.count('s')})
    assert dfg.s.tolist() == ['0', '1', '2']
    assert dfg.c.tolist() == [4, 4, 2]
def test_big_endian_binning_non_contiguous():
    """2d binning handles big-endian, strided (non-contiguous) input arrays."""
    x = np.arange(20, dtype='>f8')[::2]
    x[:] = np.arange(10, dtype='>f8')
    y = np.arange(20, dtype='>f8')[::2]
    y[:] = np.arange(10, dtype='>f8')
    df = vaex.from_arrays(x=x, y=y)
    limits = [[-0.5, 9.5], [-0.5, 9.5]]
    counts = df.count(binby=[df.x, df.y], limits=limits, shape=[10, 10])
    # all points lie on the diagonal x == y
    assert np.diagonal(counts).tolist() == [1.0] * 10
def test_count_1d():
    """Count aggregation on a manually built scalar-binner grid."""
    x = np.array([-1, -2, 0.5, 1.5, 4.5, 5], dtype='f8')
    df = vaex.from_arrays(x=x)
    binner = df._binner_scalar('x', [0, 5], 5)
    grid = vaex.superagg.Grid([binner])
    grid = df._agg(vaex.agg.count(), grid)
    # 8 slots for 5 bins: extra edge slots (presumably missing/underflow/overflow)
    assert grid.tolist() == [0, 2, 1, 1, 0, 0, 1, 1]
def test_count_1d_ordinal():
    """Count aggregation on a manually built ordinal-binner grid."""
    x = np.array([-1, -2, 0, 1, 4, 5], dtype='i8')
    df = vaex.from_arrays(x=x)
    binner = df._binner_ordinal('x', 5)
    grid = vaex.superagg.Grid([binner])
    grid = df._agg(vaex.agg.count(), grid)
    # same edge-slot layout as the scalar binner variant
    assert grid.tolist() == [0, 2, 1, 1, 0, 0, 1, 1]
def test_groupby_datetime_quarter():
    """Grouping just over a year by quarter yields 5 groups covering every row."""
    t = np.arange('2015-01-01', '2016-01-02', dtype=np.datetime64)
    y = np.arange(len(t))
    df = vaex.from_arrays(t=t, y=y)
    dfg = df.groupby(vaex.BinnerTime.per_quarter(df.t)).agg({'y': 'sum'})
    sums = dfg.y.tolist()
    assert len(sums) == 5
    # no rows lost: group sums add up to the total
    assert sum(sums) == sum(y)
def test_plain_strings():
    """Object-dtype string columns support len, trimming and slicing."""
    x = np.array(['a', 'bb', 'ccc', 'dddd'], dtype='object')
    df = vaex.from_arrays(x=x)
    col = df.columns['x']
    assert len(col) == 4
    assert col[2:4][:].tolist() == x[2:4].tolist()
    assert len(df) == 4
    sliced = df[1:3]
    assert len(sliced) == 2
    assert sliced.x.tolist() == x[1:3].tolist()
def test_arrow_strings():
    """Arrow-backed string columns support len, trimming and slicing."""
    x = ['a', 'bb', 'ccc', 'dddd']
    df = vaex.from_arrays(x=vaex.string_column(x))
    col = df.columns['x']
    assert len(col) == 4
    assert col[2:4][:].tolist() == x[2:4]
    assert len(df) == 4
    sliced = df[1:3]
    assert len(sliced) == 2
    assert sliced.x.tolist() == x[1:3]
def test_cat_string():
    """ordinal_encode marks a string column as categorical; values outside the
    supplied set become missing."""
    df0 = vaex.from_arrays(colors=['red', 'green', 'blue', 'green'])
    # auto-detected value set: three categories
    df = df0.ordinal_encode('colors')
    assert df.is_category('colors')
    assert df.limits('colors', shape=128) == ([-0.5, 2.5], 3)
    # explicit subset: 'blue' is not listed, so it encodes as missing
    df = df0.ordinal_encode('colors', values=['red', 'green'])
    assert df.is_category('colors')
    assert df.limits('colors', shape=128) == ([-0.5, 1.5], 2)
    assert df.data.colors.tolist() == [0, 1, None, 1]
    # the category flag survives a copy
    assert df.copy().is_category(df.colors)
def test_vrange():
    """vaex.vrange provides a huge virtual range column without materializing it."""
    N = 1000**3
    df = vaex.from_arrays(x=vaex.vrange(0, N))
    col = df.columns['x']
    assert len(col) == N
    part = col.trim(2, 4)
    assert (part.start, part.stop) == (2, 4)
    assert len(df) == N
    assert len(df[0:10]) == 10
    assert df[1:11].x.tolist() == np.arange(1, 11.).tolist()
    # virtual columns work on top of the virtual range
    df['y'] = df.x**2
    assert df[1:11].y.tolist() == (np.arange(1, 11)**2).tolist()
def test_groupby_count():
    """Mean aggregation grouped by a string key; both groupby syntaxes agree."""
    g = np.array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2], dtype='int32')
    s = np.array([str(v) for v in g])
    df = vaex.from_arrays(g=g, s=s)
    dfg = df.groupby('s').agg({'g': 'mean'}).sort('s')
    assert dfg.s.tolist() == ['0', '1', '2']
    assert dfg.g.tolist() == [0, 1, 2]
    # shorthand: agg passed directly to groupby gives the same frame
    dfg2 = df.groupby('s', {'g': 'mean'}).sort('s')
    assert dfg._equals(dfg2)
def test_groupby_datetime():
    """Weekly time binner sums each 7-day slice; both groupby syntaxes agree."""
    t = np.arange('2015-01-01', '2015-02-01', dtype=np.datetime64)
    y = np.arange(len(t))
    df = vaex.from_arrays(t=t, y=y)
    expected = [y[week * 7:(week + 1) * 7].sum() for week in range(5)]
    dfg = df.groupby(vaex.BinnerTime.per_week(df.t), agg={'y': 'sum'})
    assert dfg.y.tolist() == expected
    # other syntax: two-step groupby(...).agg(...)
    dfg = df.groupby(vaex.BinnerTime.per_week(df.t)).agg({'y': 'sum'})
    assert dfg.y.tolist() == expected
def test_groupby_count():
    # NOTE(review): duplicate test name — if this lives in the same module as the
    # earlier test_groupby_count, this later definition shadows it and only this
    # one is collected by pytest. Consider renaming one of them.
    # ds = ds_local.extract()
    g = np.array([0, 0, 0, 0, 1, 1, 1, 1, 0, 1], dtype='int32')
    s = np.array(list(map(str, [0, 0, 0, 0, 1, 1, 1, 1, 2, 2])))
    df = vaex.from_arrays(g=g, s=s)
    groupby = df.groupby('s')
    dfg = groupby.agg({'g': 'mean'}).sort('s')
    assert dfg.s.tolist() == ['0', '1', '2']
    # group '2' averages g values 0 and 1 -> 0.5
    assert dfg.g.tolist() == [0, 1, 0.5]
    # shorthand form should produce an identical frame
    dfg2 = df.groupby('s', {'g': 'mean'}).sort('s')
    assert dfg._equals(dfg2)
def dfs(alpha, delta, pm_a, pm_d, radians=radians):
    """Build two frames for pm_eq2gal checks: a single-row frame with declared
    uncertainties (propagated analytically) and a Monte-Carlo sampled frame
    using the same sigmas."""
    df_single = vaex.from_scalars(alpha=alpha, delta=delta, pm_a=pm_a, pm_d=pm_d,
                                  alpha_e=0.01, delta_e=0.02, pm_a_e=0.003, pm_d_e=0.004)
    df_single = df_single.astro.pm_eq2gal("alpha", "delta", "pm_a", "pm_d", "pm_l", "pm_b",
                                          propagate_uncertainties=True, radians=radians)
    N = 100000
    # distance
    # sample around the nominal values with the sigmas declared above
    df_sampled = vaex.from_arrays(
        alpha=np.random.normal(0, 0.01, N) + alpha,
        delta=np.random.normal(0, 0.02, N) + delta,
        pm_a=np.random.normal(0, 0.003, N) + pm_a,
        pm_d=np.random.normal(0, 0.004, N) + pm_d,
    )
    df_sampled.astro.pm_eq2gal("alpha", "delta", "pm_a", "pm_d", "pm_l", "pm_b",
                               radians=radians, inplace=True)
    return df_single, df_sampled
def test_format():
    """Expression.format applies printf-style formatting element-wise."""
    ints = np.array([1, 2, 3], dtype=np.int32)
    floats = np.array([1.1, 2.2, 3.3], dtype=np.float32)
    words = ['Here', 'we', 'go']
    df = vaex.from_arrays(num1=ints, num2=floats, text=words)
    assert df.num1.format("%d").tolist() == ['1', '2', '3']
    assert df.num1.format("%04d").tolist() == ['0001', '0002', '0003']
    assert df.num2.format('%f').tolist() == ['1.100000', '2.200000', '3.300000']
    assert df.num2.format('%05.2f').tolist() == ['01.10', '02.20', '03.30']
    # %s substitutes each string element
    assert df.text.format('pre-%s-post').tolist() == ['pre-%s-post' % w for w in words]
def test_vrange():
    # NOTE(review): duplicate of the other test_vrange definition (identical but
    # for whitespace) — if both are in one module, only the later one runs.
    N = 1000**3
    df = vaex.from_arrays(x=vaex.vrange(0, N))
    assert len(df.columns['x']) == N
    trimmed = df.columns['x'].trim(2,4)
    assert trimmed.start == 2
    assert trimmed.stop == 4
    assert len(df) == N
    assert len(df[0:10]) == 10
    assert df[1:11].x.tolist() == (np.arange(1, 11.)).tolist()
    # virtual column on top of the virtual range
    df['y'] = df.x**2
    assert df[1:11].y.tolist()== (np.arange(1, 11)**2).tolist()
def test_countna():
    """countna matches pandas isna(); countnan/countmissing count by kind."""
    x = np.array([5, '', 1, 4, None, 6, np.nan, np.nan, 10, '', 0, 0, -13.5])
    y = np.ma.MaskedArray(
        data=np.array([np.nan, 2, None, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]),
        mask=np.array([0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1]))
    df = vaex.from_arrays(x=x, y=y)
    pdf = df.to_pandas_df()
    assert df.x.countna() == pdf.x.isna().sum()
    assert df.y.countna() == pdf.y.isna().sum()
    assert df.x.countnan() == 2
    # 6 = 5 masked entries plus, apparently, the NaN at index 0
    assert df.y.countmissing() == 6
def test_export_json():
    """Round-trip a frame through to_dict(array_type='python') and a JSON file."""
    path = './temp.json'
    df = vaex.from_arrays(A=[1, 2, 3], B=['1', '2', '3'])
    vaex.utils.write_json_or_yaml(path, df.to_dict(array_type='python'))
    with open(path, 'r') as f:
        data = json.load(f)
    os.remove(path)
    assert 'A' in data
    assert len(data['A']) == 3
    assert data['B'][0] == '1'
def test_merge_same_aggregation_tasks():
    """Identical aggregations over identical binners collapse into one task."""
    df = vaex.from_arrays(x=[1, 2], y=[2, 3])
    binners = df._create_binners('x', [0.5, 2.5], 2)
    df._create_binners('x', [0.5, 2.5], 2)  # building binners twice is harmless
    assert len(binners) == 1
    # these two aggregations should be merged into one subtask
    [task_a], res_a = vaex.agg.count().add_tasks(df, binners)
    [task_b], res_b = vaex.agg.count().add_tasks(df, binners)
    assert len(df.executor.tasks) == 1
    df.execute()
    assert task_a is task_b
    assert np.all(res_a.get() == res_b.get())
def test_groupby_datetime():
    # NOTE(review): duplicate test name — shadows the earlier test_groupby_datetime
    # if both are collected from the same module; only the later definition runs.
    t = np.arange('2015-01-01', '2015-02-01', dtype=np.datetime64)
    y = np.arange(len(t))
    df = vaex.from_arrays(t=t, y=y)
    dfg = df.groupby(vaex.BinnerTime.per_week(df.t), agg={'y': 'sum'})
    assert dfg.y.tolist() == [y[k*7:(k+1)*7].sum() for k in range(5)]
    # other syntax
    dfg = df.groupby(vaex.BinnerTime.per_week(df.t)).agg({'y': 'sum'})
    assert dfg.y.tolist() == [y[k*7:(k+1)*7].sum() for k in range(5)]
def test_agg_filtered_df_invalid_data():
    """An aggregation on a filtered frame must not feed filtered-out rows into
    user functions (custom_func asserts the filtered value 4 never reaches it)."""
    def custom_func(x):
        assert 4 not in x
        return x**2

    df = vaex.from_arrays(x=np.arange(10))
    filtered = df[df.x != 4]
    filtered.add_function('custom_function', custom_func)
    filtered['y'] = filtered.func.custom_function(filtered.x)
    # 9 of the 10 rows survive the filter
    assert filtered.count(filtered.y) == 9
def test_cat_string():
    # NOTE(review): duplicate test name — shadows the earlier (identical but for
    # whitespace) test_cat_string if both are in the same module.
    ds0 = vaex.from_arrays(colors=['red', 'green', 'blue', 'green'])
    ds = ds0.ordinal_encode('colors')#, ['red', 'green'], inplace=True)
    assert ds.is_category('colors')
    assert ds.limits('colors', shape=128) == ([-0.5, 2.5], 3)
    ds = ds0.ordinal_encode('colors', values=['red', 'green'])
    assert ds.is_category('colors')
    assert ds.limits('colors', shape=128) == ([-0.5, 1.5], 2)
    # 'blue' was not in the value list -> encoded as missing (None)
    assert ds.data.colors.tolist() == [0, 1, None, 1]
    # the category flag survives a copy
    assert ds.copy().is_category(ds.colors)
def test_export_json():
    # NOTE(review): duplicate test name — shadows the earlier test_export_json
    # (which passes array_type='python' to to_dict); only one runs per module.
    temp_path = './temp.json'
    ds = vaex.from_arrays(**{'A': [1, 2, 3], 'B': ['1', '2', '3']})
    vaex.utils.write_json_or_yaml(temp_path, ds.to_dict())
    with open(temp_path, 'r') as f:
        data = json.load(f)
    os.remove(temp_path)
    assert 'A' in data
    assert len(data['A']) == 3
    assert data['B'][0] == '1'
def test_cyclical_transformer(tmpdir):
    """CycleTransformer encodes hours on the unit circle and survives state save/load."""
    df_train = vaex.from_arrays(hour=[0, 3, 6])
    df_test = vaex.from_arrays(hour=[12, 24, 21, 15])
    transformer = vaex.ml.CycleTransformer(n=24, features=['hour'],
                                           prefix_x='pref_', prefix_y='pref_')
    df_train = transformer.fit_transform(df_train)
    np.testing.assert_array_almost_equal(df_train.pref_hour_x.values, [1, 0.707107, 0])
    np.testing.assert_array_almost_equal(df_train.pref_hour_y.values, [0, 0.707107, 1])
    # the transform is captured in the state and applies to a fresh frame
    state_path = str(tmpdir.join('state.json'))
    df_train.state_write(state_path)
    df_test.state_load(state_path)
    np.testing.assert_array_almost_equal(df_test.pref_hour_x.values,
                                         [-1, 1, 0.707107, -0.707107])
    np.testing.assert_array_almost_equal(df_test.pref_hour_y.values,
                                         [0, 0, -0.707107, -0.707107])
def test_count_basics(df):
    """Count/sum with binby, with a selection, and on 2d/3d grids."""
    # df = df_l
    y = df.y.to_numpy()
    x = df.x.to_numpy()
    # 1d: the fixture apparently has one row per unit bin in [0, 10)
    counts = df.count(binby=df.x, limits=[0,10], shape=10)
    assert len(counts) == 10
    assert all(counts == 1), "counts is %r" % counts
    sums = df["y"].sum(binby=df.x, limits=[0,10], shape=10)
    assert len(sums) == 10
    assert(all(sums == y))
    # with an active selection, rows outside it contribute nothing
    df.select("x < 5")
    mask = x < 5
    counts = df["x"].count(binby=df.x, limits=[0,10], shape=10, selection=True)
    mod_counts = counts * 1.
    mod_counts[~mask] = 0
    assert(all(counts == mod_counts))
    mod_sums = y * 1.
    mod_sums[~mask] = 0
    sums = df["y"].sum(binby=df.x, limits=[0,10], shape=10, selection=True)
    assert(all(sums == mod_sums))
    # TODO: we may want to test this for a remote df
    # 2d: the four (x, y) points hit each of the 2x2 cells exactly once
    x = np.array([0, 1, 0, 1])
    y = np.array([0, 0, 1, 1])
    df = vaex.from_arrays(x=x, y=y)
    counts = df.count(binby=[df.x, df.y], limits=[[0.,2.], [0.,2.]], shape=2)
    assert np.all(counts == 1)
    # 3d: likewise for the 2x2x2 grid
    x = np.array([0, 1, 0, 1, 0, 1, 0, 1])
    y = np.array([0, 0, 1, 1, 0, 0, 1, 1])
    z = np.array([0, 0, 0, 0, 1, 1, 1, 1])
    df = vaex.from_arrays(x=x, y=y, z=z)
    counts = df.count(binby=[df.x, df.y, df.z], limits=[[0.,2.], [0.,2.], [0.,2.]], shape=2)
    assert np.all(counts == 1)
async def test_auto_execute():
    """Inside executor.auto_execute(), delayed tasks run without an explicit execute().

    Fixed: renamed the local `sum` to `total` — it shadowed the builtin.
    """
    df = vaex.from_arrays(x=[2, 4])

    async def means():
        count, total = await asyncio.gather(df.x.count(delay=True), df.x.sum(delay=True))
        mean = await df.x.mean(delay=True)
        return total / count, mean

    async with df.executor.auto_execute():
        mean1, mean2 = await means()
    assert mean1 == 3
    assert mean2 == 3
def test_nunique():
    """nunique counts distinct values per group; dropmissing/dropnan exclude
    missing strings and NaNs respectively."""
    s = ['aap', 'aap', 'noot', 'mies', None, 'mies', 'kees', 'mies', 'aap']
    x = [0, 0, 0, 0, 0, 1, 1, 1, 2]

    def grouped(df, **kwargs):
        dfg = df.groupby(df.x, agg={'nunique': vaex.agg.nunique(df.s, **kwargs)}).sort(df.x)
        return list(zip(dfg.x.values, dfg.nunique.values))

    df = vaex.from_arrays(x=x, s=s)
    # the None in group 0 counts as a distinct value unless dropped
    assert grouped(df) == [(0, 4), (1, 2), (2, 1)]
    assert grouped(df, dropmissing=True) == [(0, 3), (1, 2), (2, 1)]

    # same structure with floats: None becomes NaN
    mapping = {'aap': 1.2, 'noot': 2.5, 'mies': 3.7, 'kees': 4.8, None: np.nan}
    df = vaex.from_arrays(x=x, s=np.array([mapping[k] for k in s], dtype=np.float64))
    assert grouped(df) == [(0, 4), (1, 2), (2, 1)]
    assert grouped(df, dropnan=True) == [(0, 3), (1, 2), (2, 1)]
def test_weight_of_evidence_encoder_edge_cases():
    """WoE encoding of single-class categories yields large finite values, not inf."""
    y = [1, 0, 1, 0, 1, 0, 0, 0, 1, 1]
    x = ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd']
    df = vaex.from_arrays(x=x, y=y)
    encoder = vaex.ml.WeightOfEvidenceEncoder(features=['x'], target='y', unseen='zero')
    df = encoder.fit_transform(df)
    expected = [0.69314, 0.69314, 0.69314,      # 'a': y = 1,0,1
                -0.69314, -0.69314, -0.69314,   # 'b': y = 0,1,0
                -13.81550, -13.81550,           # 'c': all y == 0 -> extreme value
                13.81551, 13.81551]             # 'd': all y == 1 -> extreme value
    np.testing.assert_array_almost_equal(df.woe_encoded_x.tolist(), expected, decimal=5)
def test_delay_ordinal(binby):
    """Two delayed aggregations over ordinal-encoded columns share a single pass."""
    df = vaex.from_arrays(x=[1, 2, 2, 3, 3, 3],
                          s=["aap", "aap", "aap", "noot", "noot", "mies"])
    df.ordinal_encode("x", inplace=True)
    df.ordinal_encode("s", inplace=True)
    df.executor.passes = 0
    # schedule both (keep references alive); results themselves are not inspected
    if binby:
        pending = [df.binby('x', agg='count', delay=True),
                   df.binby('s', agg='count', delay=True)]
    else:
        pending = [df.groupby('x', agg='count', delay=True),
                   df.groupby('s', agg='count', delay=True)]
    df.execute()
    assert df.executor.passes == 1
def test_is_missing():
    """ismissing is True for masked/None entries but not for NaN in plain floats."""
    s = vaex.string_column(["aap", None, "noot", "mies"])
    o = ["aap", None, "noot", np.nan]
    x = np.arange(4, dtype=np.float64)
    x[2] = x[3] = np.nan
    m = np.ma.array(x, mask=[0, 1, 0, 1])
    df = vaex.from_arrays(x=x, m=m, s=s, o=o)
    assert df.x.ismissing().tolist() == [False, False, False, False]  # NaN is not "missing"
    assert df.m.ismissing().tolist() == [False, True, False, True]    # masked entries
    assert df.s.ismissing().tolist() == [False, True, False, False]   # None string
    assert df.o.ismissing().tolist() == [False, True, False, False]   # None object
    # notmissing is the exact complement
    assert df.m.notmissing().tolist() == [True, False, True, False]
def test_df_selection_references_virtual_column():
    """Frames whose filters reference differently-defined virtual columns must
    not share a fingerprint.

    Fixed: df2's virtual column was built from df1's expressions
    (``df1.x - df1.y``); it now uses df2's own, as clearly intended. The data
    in both copies is identical, so the expected outcome is unchanged.
    """
    x = np.arange(10, dtype='i4')
    y = x**2
    df = vaex.from_arrays(x=x, y=y)
    df1 = df.copy()
    df1['z'] = df1.x + df1.y
    df2 = df.copy()
    df2['z'] = df2.x - df2.y
    df1f = df1[df1.z < 5]
    df2f = df2[df2.z < 5]
    # z differs between the frames, so the fingerprints must differ
    fp1 = df1f.fingerprint(dependencies=['x', 'y'])
    fp2 = df2f.fingerprint(dependencies=['x', 'y'])
    assert fp1 != fp2
def webserver():
    """Fixture: serve one small frame over a threaded vaex webserver, then stop it."""
    server = vaex.webserver.WebServer(datasets=[], port=test_port, cache_byte_size=0,
                                      token='token', token_trusted='token_trusted')
    df = vaex.from_arrays(x=np.arange(10))
    df.name = 'df'
    server.set_datasets([df])
    server.serve_threaded()
    yield server
    server.stop_serving()
def test_label_encoder():
    """LabelEncoder learns per-feature value->code maps, applies them with a
    prefix, raises on unseen values unless allow_unseen=True (then -1).

    Fixed: the final assertion checked ``mypref_x`` for the unseen *y* value; it
    now checks ``mypref_y``. (The old form passed only because the same row also
    happened to contain the unseen x value 'dragon'.)
    """
    # Create sample data
    x_train = np.array(['dog', 'cat', 'mouse', 'mouse', 'dog', 'dog', 'dog', 'cat', 'cat', 'mouse', 'dog'])
    x_test = np.array(['dog', 'dog', 'cat', 'cat', 'mouse'])
    x_unseen = np.array(['mouse', 'dragon', 'dog', 'dragon'])  # unseen value 'dragon'
    y_train = np.array([1, 2, 2, 2, 3, 1, 2, 3, 5, 5, 1])
    y_test = np.array([3, 3, 1, 3, 2])
    y_unseen = np.array([3, 2, 1, 4])  # unseen value 4

    df_train = vaex.from_arrays(x=x_train, y=y_train)
    df_test = vaex.from_arrays(x=x_test, y=y_test)
    df_unseen = vaex.from_arrays(x=x_unseen, y=y_unseen)

    # Label encode with vaex.ml
    label_encoder = df_train.ml.label_encoder(features=['x', 'y'], prefix='mypref_')
    # the learned labels cover exactly the training categories
    assert set(label_encoder.labels_['x']) == set(np.unique(x_train))
    assert set(label_encoder.labels_['y']) == set(np.unique(y_train))

    df_train = label_encoder.transform(df_train)
    df_test = label_encoder.transform(df_test)
    # applying the learned mapping manually must reproduce the transform
    assert df_test.x.apply(lambda v: label_encoder.labels_['x'][v]).tolist() == df_test.mypref_x.tolist()
    assert df_test.y.apply(lambda v: label_encoder.labels_['y'][v]).tolist() == df_test.mypref_y.tolist()

    # unseen categories raise by default
    with pytest.raises(ValueError):
        label_encoder.transform(df_unseen)

    # with allow_unseen=True, unseen categories encode as -1
    label_encoder = df_train.ml.label_encoder(features=['x', 'y'], prefix='mypref_', allow_unseen=True)
    df_unseen = label_encoder.transform(df_unseen)
    assert set(df_unseen[df_unseen.x == 'dragon'].mypref_x.tolist()) == {-1}
    assert set(df_unseen[df_unseen.y == 4].mypref_y.tolist()) == {-1}
def test_agg_binary():
    """Aggregation objects compose with arithmetic operators and scalars."""
    x = np.arange(5)
    df = vaex.from_arrays(x=x, y=x + 1, g=x // 4)
    # groups: g=0 has x={0..3} (sum 6) and y sum 10; g=1 has x={4} and y sum 5

    ratio = vaex.agg.sum('x') / vaex.agg.sum('y')
    assert repr(ratio) == "(vaex.agg.sum('x') / vaex.agg.sum('y'))"
    assert df.groupby('g', agg={'total': ratio})['total'].tolist() == [6 / 10, 4 / 5]

    plus_const = vaex.agg.sum('x') + 99
    assert repr(plus_const) == "(vaex.agg.sum('x') + 99)"
    assert df.groupby('g', agg={'total': plus_const})['total'].tolist() == [105, 103]

    const_plus = 99 + vaex.agg.sum('y')
    assert repr(const_plus) == "(99 + vaex.agg.sum('y'))"
    assert df.groupby('g', agg={'total': const_plus})['total'].tolist() == [109, 104]

    # scalar on either side of a division
    assert df.groupby('g', agg={'total': vaex.agg.sum('x') / 2})['total'].tolist() == [3.0, 2.0]
    assert df.groupby('g', agg={'total': 2 / vaex.agg.sum('x')})['total'].tolist() == [2 / 6, 2 / 4]
def test_passes_mixed_filtering():
    """Delayed sums over a filtered and an unfiltered view share one dataset pass."""
    x = np.arange(10)
    df = vaex.from_arrays(x=x, y=x**2)
    filtered = df[df.x < 4]
    executor = df.executor
    executor.passes = 0
    res_filtered = filtered.sum('x', delay=True)
    res_all = df.sum('x', delay=True)
    df.execute()
    assert executor.passes == 1
    assert res_filtered.get() == 6   # 0 + 1 + 2 + 3
    assert res_all.get() == 45       # sum(range(10))
def test_vconstant(value):
    """vaex.vconstant is a virtual constant column supporting slicing and filtering."""
    length = 100
    const_col = vaex.vconstant(value=value, length=length)
    df = vaex.from_arrays(x=const_col, y=vaex.vrange(0, length))
    assert len(df.columns['x']) == length
    assert df.x[:3].tolist() == [value, value, value]
    # slicing the bare column
    assert len(const_col[:]) == 100
    assert len(const_col[10:]) == 90
    assert len(const_col[:20]) == 20
    # filtering keeps the constant value
    filtered = df[df.y < 31]
    assert len(filtered) == 31
    assert filtered.x[:3].tolist() == [value, value, value]
def test_agg_selections_equal():
    """A per-aggregation selection restricts counting within each group."""
    x = np.array([0, 0, 0, 1, 1, 2, 2])
    y = np.array([1, 3, 5, 1, 7, 1, -1])
    z = np.array([0, 2, 3, 4, 5, 6, 7])
    w = np.array(['dog', 'cat', 'mouse', 'dog', 'dog', 'mouse', 'cat'])
    df = vaex.from_arrays(x=x, y=y, z=z, w=w)
    grouped = df.groupby(df.x).agg({
        'counts': vaex.agg.count(),
        'sel_counts': vaex.agg.count(selection=df.y == 1.),
    })
    assert grouped['counts'].tolist() == [3, 2, 2]
    # exactly one y == 1 per group
    assert grouped['sel_counts'].tolist() == [1, 1, 1]
def test_map_basics():
    """Expression.map with dict mappers on string, object and float columns,
    covering NaN keys, default_value for missing keys, and allow_missing."""
    # Generate the test data
    colour = ['red', 'red', 'blue', 'red', 'green', 'green', 'red', 'blue', 'blue', 'green']
    animal = np.array(['dog', 'cat', 'dog', 'dog', 'dog', 'dog', 'cat', 'dog', 'dog', np.nan], dtype='O')
    number = [10, 20, 30, 10, 20, 30, 30, 30, 10, 20]
    floats = [10., 20., 30., 10., 20., 30., 30., 30., 10., np.nan]
    ds = vaex.from_arrays(colour=colour, animal=animal, number=number, floats=floats)
    df = pd.DataFrame(data=np.array([colour, animal, number, floats]).T, columns=['colour', 'animal', 'number', 'floats'])
    # Create a mapper - dictionary
    mapper = {}
    mapper['colour'] = {'red': 1, 'blue': 2, 'green': 3}
    mapper['animal'] = {'dog': 5, 'cat': -1, 'dolphin': 0}
    mapper['number'] = {10: 1, 20: 2, 30: 3}
    mapper['floats'] = {10.: -1, 20.: -2, 30.: -3, np.nan: -4}
    # Map the functions in vaex
    ds['colour_'] = ds.colour.map(mapper['colour'])
    ds['animal_'] = ds.animal.map(mapper['animal'])
    # ds['number_'] = ds.number.map(lambda x: mapper['number'][x])  # test with a function, not just with a dict
    ds['floats_'] = ds.floats.map(mapper['floats'], nan_value=np.nan)
    # Map in pandas
    df['colour_'] = df.colour.map(mapper['colour'])
    df['animal_'] = df.animal.map(mapper['animal'])
    # Make assertions - compare to pandas for string columns
    # we deviate from pandas, we can map nan to something
    assert ds.colour_.values.tolist()[:-1] == df.colour_.values.tolist()[:-1]
    assert ds.animal_.values.tolist()[:-1] == df.animal_.values.tolist()[:-1]
    assert ds.animal_.values.tolist()[-1] is None
    # Make assertions - compare to the expected values for numeric type
    # assert ds.number_.values.tolist() == (np.array(number)/10).tolist()
    assert ds.floats_.values.tolist()[:-1] == (np.array(floats)/-10.).tolist()[:-1]
    # the NaN entry maps through the np.nan key to -4
    assert ds.floats_.values.tolist()[-1] == -4
    # missing keys
    with pytest.raises(ValueError):
        ds.colour.map({'ret': 1, 'blue': 2, 'green': 3})
    with pytest.raises(ValueError):
        ds.colour.map({'blue': 2, 'green': 3})
    # missing keys but user-handled
    ds['colour_unmapped'] = ds.colour.map({'blue': 2, 'green': 3}, default_value=-1)
    assert ds.colour_unmapped.values.tolist() == [-1, -1, 2, -1, 3, 3, -1, 2, 2, 3]
    # extra is ok
    ds.colour.map({'red': 1, 'blue': 2, 'green': 3, 'orange': 4})
    # check masked arrays
    assert ds.colour.map({'blue': 2, 'green': 3}, allow_missing=True).tolist() == [None, None, 2, None, 3, 3, None, 2, 2, 3]
def test_timedelta_methods():
    """Timedelta accessors (.td) mirror the pandas .dt equivalents."""
    delta = np.array([187201, 1449339, 11264958, -181614], dtype='timedelta64[s]')
    df = vaex.from_arrays(delta=delta)
    pdf = pd.DataFrame({'delta': pd.Series(delta, dtype=delta.dtype)})
    for attr in ('days', 'seconds', 'microseconds', 'nanoseconds'):
        assert getattr(df.delta.td, attr).tolist() == getattr(pdf.delta.dt, attr).tolist()
    assert df.delta.td.total_seconds().tolist() == pdf.delta.dt.total_seconds().tolist()
def test_count_cat(lazy):
    """Counting binned by an ordinal-encoded column follows the supplied value order."""
    ds0 = vaex.from_arrays(colors=['red', 'green', 'blue', 'green'],
                           names=['apple', 'apple', 'berry', 'apple'])
    ds = ds0.ordinal_encode('colors', ['red', 'green', 'blue'], lazy=lazy)
    assert ds.count(binby=ds.colors).tolist() == [1, 2, 1]
    # a different ordering permutes the per-bin counts
    ds = ds0.ordinal_encode('colors', ['red', 'blue', 'green'], inplace=True, lazy=lazy)
    assert ds.count(binby=ds.colors).tolist() == [1, 1, 2]
def test_dropinf():
    """dropinf removes rows containing +/-inf, optionally checking only given columns.

    Fixed: four comparisons were bare expression statements missing ``assert``,
    so they never checked anything.
    """
    x = [1, 2, np.inf]
    y = [10, -np.inf, 2]
    z = [1, 2, 3]
    df = vaex.from_arrays(x=x, y=y, z=z)
    # checking all columns: only the first row is inf-free
    df_filter = df.dropinf()
    assert df_filter.shape == (1, 3)
    assert df_filter.values.tolist() == [[1.0, 10.0, 1.0]]
    # checking only 'x': the -inf in y may stay
    df_filter = df.dropinf(column_names=['x'])
    assert df_filter.shape == (2, 3)
    assert df_filter.values.tolist() == [[1.0, 10.0, 1.0], [2.0, -np.inf, 2.0]]
def test_join_filtered_inner():
    """Inner join on a filtered left frame keeps exactly the filtered rows."""
    df_a_filtered = df_a[df_a.y > 0]
    joined = df_a_filtered.join(other=df_b, on='x', how='inner', rsuffix='_',
                                allow_duplication=True)
    assert len(joined) == len(df_a_filtered)

    # self-join of a filtered frame; repr used to trip the selection cache
    x = np.arange(20)
    df = vaex.from_arrays(x=x, y=x**2)
    df = df[df.x > 5]
    joined_self = df.join(df, on='x', rsuffix='right_', how='inner')
    repr(joined_self)
def test_string_operations_from_mmap_file(tmpdir):
    """String ops on a read-only mmapped hdf5 file must not write into the mmap.

    See https://github.com/vaexio/vaex/pull/459
    """
    df = vaex.from_arrays(x=np.arange(5),
                          y=np.array(['This', 'is', 'a', None, 'test']))
    path = str(tmpdir / 'test.hdf5')
    df.export_hdf5(path)
    reopened = vaex.open(path)
    assert reopened.y.str.slice(start=0, stop=2).tolist() == ['Th', 'is', 'a', None, 'te']
    assert reopened.y.str.upper().tolist() == ['THIS', 'IS', 'A', None, 'TEST']
def test_groupby_same_result():
    """value_counts and a groupby count agree; a second groupby run is stable.

    The repeated groupby guards against the regression in
    https://github.com/vaexio/vaex/pull/233 (second run used a new set).
    """
    h = np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4], dtype=int)
    df = vaex.from_arrays(h=h)
    # Compare value_counts with the groupby counts for the hour column
    vc = df.h.value_counts()
    with small_buffer(df):
        group = df.groupby(by=df.h).agg({'h': 'count'})
        group = df.groupby(by=df.h).agg({'h': 'count'})  # second run: regression check
        group_sort = group.sort(by='count', ascending=False)
    assert vc.values.tolist() == group_sort['count'].values.tolist(), 'counts are not correct.'
    assert vc.index.tolist() == group_sort['h'].values.tolist(), 'the indices of the counts are not correct.'
def from_dict(data):
    """Create an in memory dataset from a dict with column names as keys and list/numpy-arrays as values

    Example

    >>> data = {'A':[1,2,3],'B':['a','b','c']}
    >>> vaex.from_dict(data)
      #    A    B
      0    1   'a'
      1    2   'b'
      2    3   'c'

    :param data: A dict of {columns:[value, value,...]}
    :rtype: DataFrame
    """
    return vaex.from_arrays(**data)
def test_groupby_options():
    """Exercise the many equivalent ways of specifying aggregations in groupby."""
    t = np.arange('2015-01-01', '2015-02-01', dtype=np.datetime64)
    y = np.arange(len(t))
    sum_answer = [y[k*7:(k+1)*7].sum() for k in range(5)]
    mean_answer = [y[k*7:(k+1)*7].mean() for k in range(5)]
    df = vaex.from_arrays(t=t, y=y)
    by = vaex.BinnerTime.per_week(df.t)
    # string shorthand
    dfg = df.groupby(by, agg={'y': 'sum'})
    assert dfg.y.tolist() == sum_answer
    # aggregation class (unbound)
    dfg = df.groupby(by, agg={'y': vaex.agg.sum})
    assert dfg.y.tolist() == sum_answer
    # named output column with an explicit source expression
    dfg = df.groupby(by, agg={'z': vaex.agg.sum('y')})
    assert dfg.z.tolist() == sum_answer
    # list form: output names are generated as <expr>_<agg>
    dfg = df.groupby(by, agg=[vaex.agg.sum('y')])
    assert dfg.y_sum.tolist() == sum_answer
    dfg = df.groupby(by, agg=[vaex.agg.sum('y'), vaex.agg.mean('y')])
    assert dfg.y_sum.tolist() == sum_answer
    assert dfg.y_mean.tolist() == mean_answer
    # multiple aggregations under one named key: names become <key>_<agg>
    dfg = df.groupby(by, agg={'z': [vaex.agg.sum('y'), vaex.agg.mean('y')]})
    assert dfg.z_sum.tolist() == sum_answer
    assert dfg.z_mean.tolist() == mean_answer
    # default is to do all columns
    dfg = df.groupby(by, agg=[vaex.agg.sum, vaex.agg.mean])
    assert dfg.y_sum.tolist() == sum_answer
    assert dfg.y_mean.tolist() == mean_answer
    dfg = df.groupby(by, agg=vaex.agg.sum)
    assert dfg.y_sum.tolist() == sum_answer
    # the binning column itself is not aggregated
    assert "t_sum" not in dfg.get_column_names()
    # two-step form: groupby(...).agg(...)
    dfg = df.groupby(by).agg({'y': 'sum'})
    assert dfg.y.tolist() == [y[k*7:(k+1)*7].sum() for k in range(5)]
    dfg = df.groupby(by).agg({'y': 'sum'})
    assert dfg.y.tolist() == [y[k*7:(k+1)*7].sum() for k in range(5)]
    # positional agg argument
    dfg = df.groupby(by, 'sum')
    assert dfg.y_sum.tolist() == sum_answer
def test_map():
    """Expression.map with dict mappers on string, object and float columns.

    NOTE(review): near-duplicate of test_map_basics, except this variant uses
    the ``nan_mapping=`` keyword where the other uses ``nan_value=`` —
    presumably different API versions; confirm which is current.
    """
    # Generate the test data
    colour = ['red', 'red', 'blue', 'red', 'green', 'green', 'red', 'blue', 'blue', 'green']
    animal = np.array(['dog', 'cat', 'dog', 'dog', 'dog', 'dog', 'cat', 'dog', 'dog', np.nan], dtype='O')
    number = [10, 20, 30, 10, 20, 30, 30, 30, 10, 20]
    floats = [10., 20., 30., 10., 20., 30., 30., 30., 10., np.nan]
    ds = vaex.from_arrays(colour=colour, animal=animal, number=number, floats=floats)
    df = pd.DataFrame(data=np.array([colour, animal, number, floats]).T, columns=['colour', 'animal', 'number', 'floats'])
    # Create a mapper - dictionary
    mapper = {}
    mapper['colour'] = {'red': 1, 'blue': 2, 'green': 3}
    mapper['animal'] = {'dog': 5, 'cat': -1, 'dolphin': 0}
    mapper['number'] = {10: 1, 20: 2, 30: 3}
    mapper['floats'] = {10.: -1, 20.: -2, 30.: -3, np.nan: -4}
    # Map the functions in vaex
    ds['colour_'] = ds.colour.map(mapper['colour'])
    ds['animal_'] = ds.animal.map(mapper['animal'])
    # ds['number_'] = ds.number.map(lambda x: mapper['number'][x])  # test with a function, not just with a dict
    ds['floats_'] = ds.floats.map(mapper['floats'], nan_mapping=np.nan)
    # Map in pandas
    df['colour_'] = df.colour.map(mapper['colour'])
    df['animal_'] = df.animal.map(mapper['animal'])
    # Make assertions - compare to pandas for string columns
    # we deviate from pandas, we can map nan to something
    assert ds.colour_.values.tolist()[:-1] == df.colour_.values.tolist()[:-1]
    assert ds.animal_.values.tolist()[:-1] == df.animal_.values.tolist()[:-1]
    assert ds.animal_.values[-1] is None
    # Make assertions - compare to the expected values for numeric type
    # assert ds.number_.values.tolist() == (np.array(number)/10).tolist()
    assert ds.floats_.values.tolist()[:-1] == (np.array(floats)/-10.).tolist()[:-1]
    # the NaN entry maps through the np.nan key to -4
    assert ds.floats_.values.tolist()[-1] == -4
    # missing keys
    with pytest.raises(ValueError):
        ds.colour.map({'ret': 1, 'blue': 2, 'green': 3})
    with pytest.raises(ValueError):
        ds.colour.map({'blue': 2, 'green': 3})
    # extra is ok
    ds.colour.map({'red': 1, 'blue': 2, 'green': 3,
                   'orange': 4})
def test_big_endian_binning():
    """2d binning handles big-endian input arrays."""
    x = np.arange(10, dtype='>f8')
    y = np.zeros(10, dtype='>f8')
    df = vaex.from_arrays(x=x, y=y)
    counts = df.count(binby=[df.x, df.y], limits=[[-0.5, 9.5], [-0.5, 0.5]], shape=[10, 1])
    # one point per x-bin, all at y == 0
    assert counts.ravel().tolist() == [1.0] * 10
def test_categorize():
    """categorize attaches labels to an integer column and flags it as a category."""
    df = vaex.from_arrays(c=[0, 1, 1, 3])
    df.categorize('c', ['a', 'b', 'c', 'd'])
    assert df.is_category(df.c)
    assert df.category_labels(df.c) == ['a', 'b', 'c', 'd']
    assert df.category_count(df.c) == 4
parser.add_argument('--number', "-n", dest="n", type=float, default=7, help="log number of rows to use") parser.add_argument('--nmax', type=int, default=9, help="number of rows for test dataset") parser.add_argument('--partitions', type=int, default=multiprocessing.cpu_count()*2, help="number of partitions to split (default: 2x number cores)") parser.add_argument('--npandas', dest="npandas", type=float, default=7, help="number of rows to use for pandas") parser.add_argument('--filter', dest="filter", default=None, help="filter for benchmark") parser.add_argument('--filename', default=default_filename, help='filename to use for benchmark export/reading') args = parser.parse_args(argv[1:]) use_dask = False if not os.path.exists(args.filename): x = np.arange(0, int(10**args.nmax)) xs = x.astype(str) s = xs#vaex.string_column(xs) df_vaex = vaex.from_arrays(x=s, s=s) df_vaex.export(args.filename, progress=True, shuffle=True) df = vaex.open(args.filename) df_vaex = df[0:int(10**args.n)] df_vaex.executor.buffer_size = len(df_vaex)//args.partitions df_pandas = df[:int(10**args.npandas)].to_pandas_df() if use_dask: df_dask = dd.from_pandas(df_pandas, npartitions=4) timings = {} def mytimeit(expr, N, scope): times = [] for i in range(N): t0 = time.time() eval(expr, scope)
def test_map_to_string():
    """Mapping a float column (with an np.nan key) to strings."""
    df = vaex.from_arrays(type=[0, 1, 2, 2, 2, np.nan])
    roles = {0: 'admin', 1: 'maintainer', 2: 'user', np.nan: 'unknown'}
    df['role'] = df['type'].map(roles)
    assert df['role'].tolist() == ['admin', 'maintainer', 'user', 'user', 'user', 'unknown']