def test_concat_functions():
    """Concatenate frames that register functions under a shared name.

    First both frames register the very same callable as 'foo' and the
    concatenated virtual column 'w' is evaluated; then two distinct callables
    are registered as 'bar' and ``vaex.concat`` is expected to raise
    ``ValueError``.
    """

    def foo(a, b):
        return a + b

    left = vaex.from_scalars(x=1, y=2)
    right = vaex.from_scalars(x=2, y=3)
    for frame in (left, right):
        frame.add_function('foo', foo)

    # 'w' is built from the same expression and the same function on both sides
    left['w'] = left.func.foo(left.x, left.y)
    right['w'] = right.func.foo(right.x, right.y)
    assert left.w.tolist() == [3]

    combined = vaex.concat([left, right])
    assert combined.w.tolist() == [1 + 2, 2 + 3]

    # 'bar' is one name backed by two different function objects
    def bar1(a, b):
        return a + b

    def bar2(a, b):
        return a + b

    left = vaex.from_scalars(x=1, y=2)
    right = vaex.from_scalars(x=2, y=3)
    left.add_function('bar', bar1)
    right.add_function('bar', bar2)
    with pytest.raises(ValueError):
        vaex.concat([left, right])
def test_multiple_tasks_different_columns_names():
    """Schedule delayed sums on two frames and read both after one execute()."""
    first = vaex.from_scalars(x=1, y=2)
    second = vaex.from_scalars(x=1, y=2)

    pending_x = first.sum('x', delay=True)
    pending_y = second.sum('y', delay=True)

    # execute() is only called on the first frame; both results are read afterwards
    first.execute()

    assert pending_x.get() == 1
    assert pending_y.get() == 2
def test_concat_keep_virtual():
    """A virtual column with the same expression on both inputs stays virtual after concat."""
    left = vaex.from_scalars(x=1, y=2)
    right = vaex.from_scalars(x=2, y=3)

    # 'w' has the same expression in both frames
    left['w'] = left.x + left.y
    right['w'] = right.x + right.y

    combined = vaex.concat([left, right])

    assert 'w' in combined.virtual_columns
    real_column_names = combined.get_column_names(virtual=False)
    assert 'w' not in real_column_names
    assert 'w' not in combined.dataset
def test_passes_two_datasets():
    """Track ``executor.passes`` across an immediate sum and two delayed sums.

    The counter is read from ``df1.executor``; it is reset to 0, expected to be
    1 after one immediate sum, and 3 after executing delayed sums queued on two
    different frames.
    """
    first = vaex.from_scalars(x=1, y=2)
    second = vaex.from_scalars(x=1, y=3)

    counter_owner = first.executor
    counter_owner.passes = 0

    first.sum('x')
    assert counter_owner.passes == 1

    # queue one delayed task per frame, then run them via the first frame
    first.sum('x', delay=True)
    second.sum('x', delay=True)
    first.execute()
    assert counter_owner.passes == 3
def test_concat_unequals_virtual_columns():
    """Concat evaluates virtual columns per input, even when their expressions differ."""
    # NOTE(review): another function with this exact name is defined in this file;
    # in a single module the later definition replaces the earlier one.
    left = vaex.from_scalars(x=1, y=2)
    right = vaex.from_scalars(x=2, y=3)

    # 'w': identical expression on both sides
    left['w'] = left.x + left.y
    right['w'] = right.x + right.y
    # 'z': a sum on the left, a product on the right
    left['z'] = left.x + left.y
    right['z'] = right.x * right.y

    combined = vaex.concat([left, right])

    expected_w = [1 + 2, 2 + 3]
    expected_z = [1 + 2, 2 * 3]
    assert combined.w.tolist() == expected_w
    assert combined.z.tolist() == expected_z
def test_concat_unequals_virtual_columns():
    """Check concatenated values of a matching ('w') and a mismatching ('z') virtual column."""
    # NOTE(review): this name is also used by another definition in this file;
    # in a single module only the last definition remains bound.
    head = vaex.from_scalars(x=1, y=2)
    tail = vaex.from_scalars(x=2, y=3)

    # same expression for 'w' in both frames
    head['w'] = head.x + head.y
    tail['w'] = tail.x + tail.y

    # different expressions for 'z'
    head['z'] = head.x + head.y
    tail['z'] = tail.x * tail.y

    stacked = vaex.concat([head, tail])

    w_values = stacked.w.tolist()
    assert w_values == [1 + 2, 2 + 3]
    z_values = stacked.z.tolist()
    assert z_values == [1 + 2, 2 * 3]
def test_join_functions():
    """Join a frame whose virtual column comes from ``apply`` and check the result.

    After the join, 'lambda_function' must be listed by ``get_names()`` and the
    columns of both inputs must carry the expected single values.
    """
    left = vaex.from_scalars(j=444, x=1, y=2)
    right = vaex.from_scalars(k=555, x=1)

    # the applied function must stay a lambda: the test asserts on 'lambda_function'
    # NOTE(review): the argument is taken from the left frame (df1.x), as in the original
    right['z'] = right.apply(lambda x: x + 10, arguments=[left.x])

    joined = left.join(right, on='x')

    assert 'lambda_function' in joined.get_names()
    assert joined.x.tolist() == [1]
    assert joined.y.tolist() == [2]
    assert joined.z.tolist() == [11]
    assert joined.j.tolist() == [444]
    assert joined.k.tolist() == [555]
def test_propagate_uncertainty():
    """The propagated uncertainty of ``r = x + y`` expands to a quadrature sum of e_x and e_y."""
    # NOTE(review): a function with the same name is defined again in this file.
    frame = vaex.from_scalars(x=1, y=2, e_x=2, e_y=4)
    frame['r'] = frame.x + frame.y
    frame.propagate_uncertainties([frame.r])

    # printed for inspection when the test is run with output capturing disabled
    print(frame.r_uncertainty.expression)

    expanded = frame.r_uncertainty.expand()
    assert expanded.expression == 'sqrt(((e_x ** 2) + (e_y ** 2)))'
def test_expression_expand():
    """Check ``Expression.variables()`` along a chain of virtual columns g, r, s, t, u.

    Covers the default call as well as the ``ourself`` and ``include_virtual``
    keyword combinations.
    """
    # NOTE(review): this file contains several definitions named test_expression_expand.
    frame = vaex.from_scalars(x=1, y=2)

    # g is a plain alias of x
    frame['g'] = frame.x
    assert frame.g.expression == 'g'
    assert frame.g.variables() == {'x'}
    # TODO: this doesn't work, because ourself and include_virtual contradict each other,
    # but we don't use this internally
    # assert ds.g.variables(ourself=True, include_virtual=False) == {'g', 'x'}

    # r depends on two real columns
    frame['r'] = frame.x * frame.y
    assert frame.r.expression == 'r'
    assert frame.r.variables() == {'x', 'y'}
    assert frame.r.variables(ourself=True, include_virtual=False) == {'r', 'x', 'y'}

    # s depends on the virtual column r and the real column x
    frame['s'] = frame.r + frame.x
    assert frame.s.variables() == {'r', 'x', 'y'}
    assert frame.s.variables(ourself=True) == {'s', 'r', 'x', 'y'}
    assert frame.s.variables(include_virtual=False) == {'x', 'y'}
    assert frame.s.variables(ourself=True, include_virtual=False) == {'s', 'x', 'y'}

    # t and u extend the chain by one level each
    frame['t'] = frame.s + frame.y
    assert frame.t.variables() == {'s', 'r', 'x', 'y'}

    frame['u'] = np.arctan(frame.t)
    assert frame.u.variables() == {'t', 's', 'r', 'x', 'y'}
def test_join_virtual_columns(on):
    """Join two frames with clashing real, virtual and hidden ('__h') columns.

    :param on: forwarded unchanged as the ``on`` argument of ``join``.

    Left-hand columns keep their names; clashing right-hand columns are
    expected under the 'r_' prefix and '_rhs' suffix.
    """
    left = vaex.from_scalars(j=444, x=1, y=2)
    left['z'] = left.x + left.y
    left['__h'] = left.z * 2

    right = vaex.from_scalars(j=444, x=2, yy=3)
    right['z'] = right.x + right.yy
    right['__h'] = right.z * 3

    joined = left.join(right, rprefix='r_', rsuffix='_rhs', on=on)

    # columns originating from the left frame
    assert joined.x.values[0] == 1
    assert joined.y.values[0] == 2
    assert joined.z.values[0] == 3
    assert joined.__h.values[0] == 6

    # columns originating from the right frame
    assert joined.r_x_rhs.values[0] == 2
    assert joined.yy.values[0] == 3
    assert joined.r_z_rhs.values[0] == 5
    assert joined.__r_h_rhs.values[0] == 15
def test_matrix():
    """Apply 3d matrix virtual columns and check values and propagated uncertainties.

    Uses the identity matrix first, then a matrix that swaps the first two axes.
    """
    # NOTE(review): a function with the same name is defined again in this file.

    # identity: xn must equal x, and its uncertainty must equal x_e
    frame = vaex.from_scalars(x=1, y=0, z=0, x_e=0.1, y_e=0.2, z_e=0.3)
    identity = [[1, 0, 0], [0, 1, 0], [0, 0, 1]]
    frame.add_virtual_columns_matrix3d(frame.x, frame.y, frame.z, 'xn', 'yn', 'zy', identity)
    frame.propagate_uncertainties([frame.xn])
    assert frame.xn.values[0] == frame.x.values[0]
    assert frame.xn_uncertainty.values[0] == frame.x_e.values[0]

    # swap of the first two axes: xn follows y, yn follows x
    frame = vaex.from_scalars(x=1, y=0, z=0, x_e=0.1, y_e=0.2, z_e=0.3)
    swap_xy = [[0, 1, 0], [1, 0, 0], [0, 0, 1]]
    frame.add_virtual_columns_matrix3d(frame.x, frame.y, frame.z, 'xn', 'yn', 'zy', swap_xy)
    frame.propagate_uncertainties([frame.xn, frame.yn])
    assert frame.xn.values[0] == frame.y.values[0]
    assert frame.xn_uncertainty.values[0] == frame.y_e.values[0]
    assert frame.yn.values[0] == frame.x.values[0]
    assert frame.yn_uncertainty.values[0] == frame.x_e.values[0]
def test_matrix():
    """Check matrix3d virtual columns and their uncertainties for two matrices."""
    # NOTE(review): this name is also used by another definition in this file.

    # Case 1: identity matrix
    table = vaex.from_scalars(x=1, y=0, z=0, x_e=0.1, y_e=0.2, z_e=0.3)
    table.add_virtual_columns_matrix3d(
        table.x, table.y, table.z, 'xn', 'yn', 'zy',
        [[1, 0, 0], [0, 1, 0], [0, 0, 1]])
    table.propagate_uncertainties([table.xn])
    assert table.xn.values[0] == table.x.values[0]
    assert table.xn_uncertainty.values[0] == table.x_e.values[0]

    # Case 2: first two rows exchanged, so xn reads y and yn reads x
    table = vaex.from_scalars(x=1, y=0, z=0, x_e=0.1, y_e=0.2, z_e=0.3)
    table.add_virtual_columns_matrix3d(
        table.x, table.y, table.z, 'xn', 'yn', 'zy',
        [[0, 1, 0], [1, 0, 0], [0, 0, 1]])
    table.propagate_uncertainties([table.xn, table.yn])
    assert table.xn.values[0] == table.y.values[0]
    assert table.xn_uncertainty.values[0] == table.y_e.values[0]
    assert table.yn.values[0] == table.x.values[0]
    assert table.yn_uncertainty.values[0] == table.x_e.values[0]
def test_join_variables():
    """Join frames whose virtual column 'z' depends on same-named variables 'a' and 'b'.

    Each side defines its own values for the variables; after the join both
    'z' columns must still evaluate with the values of their original frame.
    """
    left = vaex.from_scalars(j=444, x=1, y=2)
    left.add_variable('a', 2)
    left.add_variable('b', 3)
    left['z'] = left.x * left['a'] + left.y * left['b']

    right = vaex.from_scalars(j=444, x=2, yy=3)
    right.add_variable('a', 3)
    right.add_variable('b', 4)
    right['z'] = right.x * right['a'] + right.yy * right['b']

    joined = left.join(right, rprefix='r_', rsuffix='_rhs')

    # left side: untouched names, z computed with a=2, b=3
    assert joined.x.values[0] == 1
    assert joined.y.values[0] == 2
    assert joined.z.values[0] == 2 + 2 * 3

    # right side: renamed where clashing, z computed with a=3, b=4
    assert joined.r_x_rhs.values[0] == 2
    assert joined.yy.values[0] == 3
    assert joined.r_z_rhs.values[0] == 2 * 3 + 3 * 4
def test_column_list_traitlets():
    """``ColumnsMixin.columns`` follows columns being added to and removed from the frame."""
    frame = vaex.from_scalars(x=1, y=2)
    frame['z'] = frame.x + frame.y

    tracker = vt.ColumnsMixin(df=frame)
    assert len(tracker.columns) == 3  # x, y, z

    frame['w'] = frame.z * 2
    assert len(tracker.columns) == 4

    del frame['w']
    assert len(tracker.columns) == 3
def test_expression_expand():
    """Check ``Expression.expand()`` on nested virtual columns, with and without ``stop``."""
    # NOTE(review): this file contains several definitions named test_expression_expand.
    frame = vaex.from_scalars(x=1, y=2)

    frame['r'] = frame.x * frame.y
    assert frame.r.expression == 'r'
    assert frame.r.expand().expression == '(x * y)'

    frame['s'] = frame.r + frame.x
    fully_expanded_s = frame.s.expand()
    assert fully_expanded_s.expression == '((x * y) + x)'

    # with stop=['r'] the expansion must leave r untouched
    frame['t'] = frame.s + frame.y
    partially_expanded_t = frame.t.expand(stop=['r'])
    assert partially_expanded_t.expression == '((r + x) + y)'

    frame['u'] = np.arctan2(frame.s, frame.y)
    partially_expanded_u = frame.u.expand(stop=['r'])
    assert partially_expanded_u.expression == 'arctan2((r + x), y)'
def test_expression_expand():
    """``variables()`` is expected to report only x and y for every level of nesting."""
    # NOTE(review): this file contains several definitions named test_expression_expand.
    frame = vaex.from_scalars(x=1, y=2)
    real_columns = {'x', 'y'}

    frame['r'] = frame.x * frame.y
    assert frame.r.expression == 'r'
    assert frame.r.variables() == real_columns

    frame['s'] = frame.r + frame.x
    assert frame.s.variables() == real_columns

    frame['t'] = frame.s + frame.y
    assert frame.t.variables() == real_columns

    frame['u'] = np.arctan(frame.t)
    assert frame.u.variables() == real_columns
def test_nested_use_of_executor():
    """A delayed callback may itself run a computation on the same frame."""
    frame = vaex.from_scalars(x=1, y=2)

    @vaex.delayed
    def add_y_sum(x_total):
        # although the executor is still in its loop, it's not using the threads anymore,
        # so we should be able to use the executor again
        return x_total + frame.y.sum()

    result = add_y_sum(frame.x.sum(delay=True))
    frame.execute()
    assert result.get() == 1 + 2
def dfs(alpha, delta, pm_a, pm_d, radians=radians):
    """Build a single-row frame and a sampled frame for proper-motion conversion checks.

    :param alpha, delta, pm_a, pm_d: central values for the four input columns.
    :param radians: forwarded to ``astro.pm_eq2gal``; defaults to the ``radians``
        name from the enclosing scope at definition time.
    :return: tuple ``(ds_1, ds_many)``: the one-row frame converted with
        ``propagate_uncertainties=True``, and a 100000-row frame of normally
        perturbed inputs converted in place.
    """
    single = vaex.from_scalars(
        alpha=alpha, delta=delta, pm_a=pm_a, pm_d=pm_d,
        alpha_e=0.01, delta_e=0.02, pm_a_e=0.003, pm_d_e=0.004)
    single = single.astro.pm_eq2gal(
        "alpha", "delta", "pm_a", "pm_d", "pm_l", "pm_b",
        propagate_uncertainties=True, radians=radians)

    sample_count = 100000
    # The noise scales repeat the *_e values given to the single-row frame.
    # The draw order (alpha, delta, pm_a, pm_d) is kept as in the original so a
    # seeded global RNG produces the same samples.
    alpha_samples = np.random.normal(0, 0.01, sample_count) + alpha
    delta_samples = np.random.normal(0, 0.02, sample_count) + delta
    pm_a_samples = np.random.normal(0, 0.003, sample_count) + pm_a
    pm_d_samples = np.random.normal(0, 0.004, sample_count) + pm_d

    sampled = vaex.from_arrays(
        alpha=alpha_samples, delta=delta_samples, pm_a=pm_a_samples, pm_d=pm_d_samples)
    sampled.astro.pm_eq2gal(
        "alpha", "delta", "pm_a", "pm_d", "pm_l", "pm_b",
        radians=radians, inplace=True)
    return single, sampled
def test_expression_expand():
    """Check ``variables()`` with its ``ourself`` / ``include_virtual`` options on r, s, t, u."""
    # NOTE(review): this file contains several definitions named test_expression_expand.
    frame = vaex.from_scalars(x=1, y=2)

    frame['r'] = frame.x * frame.y
    assert frame.r.expression == 'r'
    assert frame.r.variables() == {'x', 'y'}

    # s = r + x: the default call reports the virtual dependency r as well
    frame['s'] = frame.r + frame.x
    assert frame.s.variables() == {'r', 'x', 'y'}
    assert frame.s.variables(ourself=True) == {'s', 'r', 'x', 'y'}
    assert frame.s.variables(include_virtual=False) == {'x', 'y'}
    assert frame.s.variables(ourself=True, include_virtual=False) == {'s', 'x', 'y'}

    frame['t'] = frame.s + frame.y
    assert frame.t.variables() == {'s', 'r', 'x', 'y'}

    frame['u'] = np.arctan(frame.t)
    assert frame.u.variables() == {'t', 's', 'r', 'x', 'y'}
def test_virtual_columns_equatorial():
    """Equatorial-to-galactic cartesian columns at distance 1 must have norm 1."""
    frame = vaex.from_scalars(alpha=0, delta=0, distance=1)
    frame.add_virtual_columns_equatorial_to_galactic_cartesian(
        "alpha", "delta", "distance", "x", "y", "z", radians=False)
    frame.add_virtual_column("r", "sqrt(x**2+y**2+z**2)")

    x, y, z = (frame[axis].values[0] for axis in ('x', 'y', 'z'))

    # NOTE(review): both checks rely on exact floating-point equality
    assert x**2 + y**2 + z**2 == 1
    assert frame['r'].values[0] == 1
def test_selection_toggle_list():
    """``SelectionToggleList`` tracks the frame's selections in names and value.

    Walks through creating, toggling and clearing the 'default' and 'neg'
    selections, checking ``selection_names`` and ``value`` after each step.
    """
    frame = vaex.from_scalars(x=1)
    toggle = vaex.jupyter.widgets.SelectionToggleList(df=frame)

    # nothing selected yet
    assert toggle.selection_names == []
    assert toggle.value == []

    # a new selection is listed but not toggled on
    frame.select('x > 0')
    assert toggle.selection_names == ['default']
    assert toggle.value == []

    # toggled values survive adding another selection
    toggle.value = ['default']
    frame.select('x < 0', name='neg')
    assert toggle.selection_names == ['default', 'neg']
    assert toggle.value == ['default']

    # clearing a selection removes it from both names and value
    frame.select_nothing('default')
    assert toggle.selection_names == ['neg']
    assert toggle.value == []

    # re-creating 'default' lists it first again, still untoggled
    frame.select('x > 0')
    assert toggle.selection_names == ['default', 'neg']
    assert toggle.value == []

    # clearing selections one by one shrinks the toggled value accordingly
    toggle.value = ['default', 'neg']
    frame.select_nothing('default')
    assert toggle.value == ['neg']
    frame.select_nothing('neg')
    assert toggle.value == []
def test_eq2gal():
    """``astro.eq2gal`` must produce l/b columns that differ from the ra/dec inputs.

    The results are compared against one-element lists. ``tolist()`` returns a
    list, and comparing a list with a bare int via ``!=`` is always True, which
    would make the assertions vacuous.
    """
    df = vaex.from_scalars(ra=1, dec=2)
    df = df.astro.eq2gal()
    assert df.l.tolist() != [1]
    assert df.b.tolist() != [2]
def test_invert():
    """The ``~`` operator on a comparison must expand to the inverted expression."""
    df = vaex.from_scalars(x=1, y=2)
    df['r'] = ~(df.x > df.y)
    # The comparison used to be a bare expression statement, so nothing was checked.
    # NOTE(review): the expected string was therefore never exercised; confirm it
    # matches the exact parenthesisation expand() emits.
    assert df.r.expand().expression == '~(x > y)'
def test_propagate_uncertainty():
    """Propagating uncertainties through ``r = x + y`` yields sqrt(e_x**2 + e_y**2)."""
    # NOTE(review): this name is also used by another definition in this file.
    table = vaex.from_scalars(x=1, y=2, e_x=2, e_y=4)
    table['r'] = table.x + table.y
    table.propagate_uncertainties([table.r])

    # printed so the unexpanded expression is visible in the test output
    print(table.r_uncertainty.expression)

    expected = 'sqrt(((e_x ** 2) + (e_y ** 2)))'
    assert table.r_uncertainty.expand().expression == expected
def test_open_nonstandard_extension(tmpdir):
    """An HDF5 export saved under a '.xyz' extension can be reopened with ``vaex.open``.

    :param tmpdir: directory object supporting ``/`` to build the file path.
    """
    target = tmpdir / 'this_is_hdf5.xyz'

    original = vaex.from_scalars(x=1, s='Hello')
    original.export_hdf5(target)

    reopened = vaex.open(target)
    assert reopened.x.tolist() == [1]
    assert reopened.s.tolist() == ['Hello']
def compute_flow_data(days, hours, zone):
    """Aggregate trips leaving ``zone`` towards other boroughs.

    :param days: forwarded to ``create_selection``.
    :param hours: forwarded to ``create_selection``.
    :param zone: pickup zone index; also used as key into
        ``zone_index_to_borough_index``.
    :return: dict with keys ``outflow_top``, ``outflow_rest`` and
        ``outflow_borough``, each the ``to_dict(array_type='list')`` form of a
        dataframe:

        * ``outflow_top``: per destination borough, the first ``n_largest``
          dropoff zones by trip count;
        * ``outflow_rest``: per destination borough, one row summing the trips
          of the remaining zones (empty when no borough exceeds ``n_largest``);
        * ``outflow_borough``: trip counts summed per pickup/dropoff borough.
    """
    logger.info("Compute: flow data: days=%r hours=%r zone=%r", days, hours, zone)

    # only the dataframe is used; the second item returned by create_selection is not
    df, _ = create_selection(days, hours)
    df.select(df.pickup_zone == zone, mode='and')

    # selection=True: presumably counts within the selection narrowed just above
    trip_count = vaex.agg.count(selection=True)
    flows = df.groupby([df.pickup_zone, df.dropoff_zone], agg={'count_trips': trip_count})
    # sort descending so we can take the top N
    flows = flows.sort('count_trips', ascending=False)
    flows['pickup_borough'] = flows.pickup_zone.map(zone_index_to_borough_index)
    flows['dropoff_borough'] = flows.dropoff_zone.map(zone_index_to_borough_index)

    origin_borough = zone_index_to_borough_index[zone]

    # keep only trips leaving from this zone and ending in a different borough
    leaving = flows[(flows.pickup_zone == zone)]
    leaving = leaving[leaving.dropoff_borough != origin_borough]

    top_frames = []
    rest_frames = []
    for dropoff_borough in range(6):
        if dropoff_borough == origin_borough:
            continue
        # outflow from this zone, to a particular borough
        to_borough = leaving[leaving.dropoff_borough == dropoff_borough]
        row_count = len(to_borough)
        if not row_count:
            continue
        # top N zones of outflow from this zone, to a particular borough
        top_frames.append(to_borough[:min(row_count, n_largest)])
        if row_count <= n_largest:
            continue
        # rest of the outflow from this zone, to a particular borough, folded into
        # one row whose dropoff_zone index lies past the real zone indices
        remaining_trips = to_borough[n_largest:]['count_trips'].sum()
        rest_frames.append(
            vaex.from_scalars(pickup_borough=origin_borough,
                              dropoff_borough=dropoff_borough,
                              dropoff_zone=len(zone_index_to_name) + dropoff_borough,
                              count_trips=remaining_trips))

    # NOTE(review): top_frames may be empty here (no outflow to any other borough);
    # confirm vaex.concat accepts an empty list.
    outflow_top = vaex.concat(top_frames)
    outflow_borough = leaving.groupby(
        ['pickup_borough', 'dropoff_borough'],
        agg={'count_trips': vaex.agg.sum('count_trips')})

    if rest_frames:
        outflow_rest = vaex.concat(rest_frames)
    else:
        # create an empty dataframe with the same schema to make the rest of the code simpler
        placeholder = vaex.from_scalars(pickup_borough=-1,
                                        dropoff_borough=-1,
                                        dropoff_zone=-1,
                                        count_trips=-1)
        outflow_rest = placeholder[:0]

    # return as dict and lists so it can be serialized by the memoize decorator
    # (decorator not visible in this block)
    return {
        'outflow_top': outflow_top.to_dict(array_type='list'),
        'outflow_rest': outflow_rest.to_dict(array_type='list'),
        'outflow_borough': outflow_borough.to_dict(array_type='list'),
    }