def test_string():
    """Check interchange-protocol metadata of a string column with one missing value."""

    def column_a(frame):
        # Build a fresh interchange column object for "A" on every call.
        return frame.__dataframe__().get_column_by_name("A")

    def check_round_trip(frame):
        # Convert through the interchange protocol and back, then re-check metadata.
        restored = _from_dataframe_to_vaex(frame.__dataframe__())
        assert restored.A.tolist() == frame.A.tolist()
        assert column_a(restored).null_count == 1
        assert column_a(restored).describe_null == (3, 0)
        assert column_a(restored).dtype[0] == _DtypeKind.STRING

    source = vaex.from_dict({"A": ["a", None, "cdef", "", "g"]})

    # Full frame: five rows, one of them missing.
    whole = column_a(source)
    assert whole._col.tolist() == source.A.tolist()
    assert whole.size == 5
    assert whole.null_count == 1
    assert whole.dtype[0] == _DtypeKind.STRING
    assert whole.describe_null == (3, 0)
    check_round_trip(source)

    # Sliced frame: the first row is dropped, the missing value is still included.
    tail = source[1:]
    tail_col = column_a(tail)
    assert tail_col.size == 4
    assert tail_col.null_count == 1
    assert tail_col.dtype[0] == _DtypeKind.STRING
    assert tail_col.describe_null == (3, 0)
    check_round_trip(tail)
def create(**arrays):
    """Build a vaex DataFrame from keyword arrays, converting each one when possible.

    Every value is passed through ``array_factory_arrow_chunked``; a value
    whose conversion raises is stored unchanged.

    :param arrays: column name -> array-like data
    :return: the DataFrame produced by ``vaex.from_dict``
    """
    def try_convert(ar):
        try:
            return array_factory_arrow_chunked(ar)
        except Exception:
            # Best effort: keep the input as given when it cannot be converted.
            # ``Exception`` (not a bare ``except``) so that KeyboardInterrupt
            # and SystemExit still propagate.
            return ar

    return vaex.from_dict({k: try_convert(v) for k, v in arrays.items()})
def create(**arrays):
    """Build a vaex DataFrame from keyword arrays, converting each one when possible.

    Every value is passed through ``pa.array``; a value whose conversion
    raises is stored unchanged.

    :param arrays: column name -> array-like data
    :return: the DataFrame produced by ``vaex.from_dict``
    """
    def try_convert(ar):
        try:
            return pa.array(ar)
        except Exception:
            # Best effort: keep the input as given when it cannot be converted.
            # ``Exception`` (not a bare ``except``) so that KeyboardInterrupt
            # and SystemExit still propagate.
            return ar

    return vaex.from_dict({k: try_convert(v) for k, v in arrays.items()})
def test_apply_with_invalid_identifier():
    """``apply`` accepts columns whose names are not valid Python identifiers."""
    frame = vaex.from_dict({"#": [1], "with space": [2]})

    def add(a, b):
        return a + b

    operands = [frame["#"], frame["with space"]]
    summed = frame.apply(add, arguments=operands)
    assert summed.tolist() == [3]
def from_records(records: List[Dict], array_type="arrow", defaults=None) -> vaex.dataframe.DataFrame:
    '''Create a dataframe from a list of dict.

    .. warning:: This is for convenience only, for performance pass arrays to :func:`from_arrays` for instance.

    :param records: list of dicts, one per row, mapping column name to value
    :param str array_type: {array_type}
    :param dict defaults: default values if a record has a missing entry;
        omitted or None means no defaults, so missing entries become None
    '''
    # A None sentinel avoids sharing one mutable default dict between calls.
    defaults = dict() if defaults is None else defaults
    arrays = dict()
    for i, record in enumerate(records):
        for name, value in record.items():
            if name not in arrays:
                # column first seen at row i: back-fill the i earlier rows
                # with its default (None when no default is given)
                arrays[name] = [defaults.get(name)] * i
            arrays[name].append(value)
        for name in arrays:
            if name not in record:
                # missing values get replaced
                arrays[name].append(defaults.get(name))
    arrays = {k: vaex.array_types.convert(v, array_type) for k, v in arrays.items()}
    return vaex.from_dict(arrays)
def test_combined_grouper_over64bit():
    """Group by six int columns whose bit widths add up to more than 64."""
    bits = 2 * [15, 16, 17]
    assert sum(bits) > 64
    N = 2**max(bits)

    def unique_ints(offset, bit):
        # length-N array holding 2**bit distinct values; the tail repeats `offset`
        count = 2**bit
        values = np.full(N, offset, dtype='int32')
        values[:count] = np.arange(offset, offset + count)
        return values

    arrays = {}
    for i, bit in enumerate(bits):
        arrays[f'x_{i}'] = unique_ints(i, bit)
    names = list(arrays)
    df = vaex.from_dict(arrays)
    dfg = df.groupby(names).agg('count')

    for i, bit in enumerate(bits):
        xi = dfg[f'x_{i}'].to_numpy()
        assert len(xi) == N
        assert len(np.unique(xi)) == 2**bit
    assert dfg['count'].sum() == N

    # Both the combined and the single-column groupby must honour row_limit.
    with pytest.raises(vaex.RowLimitException, match='.* >= 2 .*'):
        df.groupby(names, row_limit=2)
    with pytest.raises(vaex.RowLimitException):
        df.groupby([names[0]], row_limit=2**bits[0] - 1)
def process(_ignore):
    """Decode the combined bin values into per-parent indices and gather parent labels.

    ``self``, ``multipliers``, ``parents``, ``sort`` and ``progressbar`` are not
    defined here; they are taken from the enclosing scope. The single argument
    is not used.

    :return: a ``pa.StructArray`` with one field per parent label (or, for
        parents whose bin values are themselves a struct, one per struct field)
    """
    logger.info(f"extracing indices of parent groupers ({self.N:,} unique rows)")
    df = vaex.from_dict({'row': vaex.vrange(0, self.N, dtype='i8'), 'bin_value': self.bin_values})
    # Split each combined bin value into one index per parent: integer-divide by
    # the parent's multiplier and carry the remainder on to the next parent.
    df[f'index_0'] = df['bin_value'] // multipliers[0]
    df[f'leftover_0'] = df[f'bin_value'] % multipliers[0]
    for i in range(1, len(multipliers)):
        df[f'index_{i}'] = df[f'leftover_{i-1}'] // multipliers[i]
        df[f'leftover_{i}'] = df[f'leftover_{i-1}'] % multipliers[i]
    columns = [f'index_{i}' for i in range(len(multipliers))]
    indices_parents = df.evaluate(columns, progress=progressbar)

    def compress(ar):
        # Integer arrays are cast to the dtype that
        # vaex.utils.required_dtype_for_max reports for their maximum
        # (presumably the smallest sufficient one - confirm).
        if vaex.dtype_of(ar).kind == 'i':
            ar = vaex.array_types.to_numpy(ar)
            max_value = ar.max()
            ar = ar.astype(vaex.utils.required_dtype_for_max(max_value))
        return ar

    indices_parents = [compress(ar) for ar in indices_parents]
    bin_values = {}
    logger.info(f"extracing labels of parent groupers...")
    # NOTE: we can also use dict encoding instead of take
    for indices, parent in zip(indices_parents, parents):
        if sort:
            assert parent.pre_sort, "cannot sort while parent not presorted"
            assert parent.sort_indices is None
        dtype = vaex.dtype_of(parent.bin_values)
        if dtype.is_struct:
            # collapse parent struct into our flat struct
            for field, ar in zip(parent.bin_values.type, parent.bin_values.flatten()):
                bin_values[field.name] = ar.take(indices)
                # bin_values[field.name] = pa.DictionaryArray.from_arrays(indices, ar)
        else:
            bin_values[parent.label] = parent.bin_values.take(indices)
            # bin_values[parent.label] = pa.DictionaryArray.from_arrays(indices, parent.bin_values)
    logger.info(f"extracing labels of parent groupers done")
    return pa.StructArray.from_arrays(bin_values.values(), bin_values.keys())
def agg(self, actions):
    """Aggregate ``actions`` per group and return a DataFrame of the non-empty groups."""
    # TODO: this basically forms a cartesian product, we can do better, use a
    # 'multistage' hashmap
    aggregated = super(GroupBy, self)._agg(actions)

    # we don't want non-existing pairs (e.g. Amsterdam in France does not exist)
    counts = self.counts
    if counts is None:
        # nobody wanted to know count*, but we need it
        counts = self.df._agg(vaex.agg.count(edges=True), self.grid, delay=_USE_DELAY)
    self.df.execute()

    if _USE_DELAY:
        aggregated = {name: pending.get() for name, pending in aggregated.items()}
        counts = counts.get()

    # take out the edges
    central = {}
    for name, grid_values in aggregated.items():
        central[name] = vaex.utils.extract_central_part(grid_values)
    counts = vaex.utils.extract_central_part(counts)

    # Keep only the grid cells that actually contain rows.
    mask = counts > 0
    mesh = np.meshgrid(*self.coords1d, indexing='ij')
    coords = [axis[mask] for axis in mesh]
    labels = {}
    for by, coord in zip(self.by, coords):
        labels[str(by.expression)] = coord

    df_grouped = vaex.from_dict(labels)
    for name, grid_values in central.items():
        df_grouped[name] = grid_values[mask]
    return df_grouped
def update_flow_figures(days, hours, zone):
    """Rebuild the sankey figure, sunburst figure and table for the given selection.

    Returns a 5-tuple: sankey figure, sunburst figure, table records,
    table style and the literal string 'trigger loader'.
    """
    logger.info(
        'Figure: update sankey and sunburst for days=%r hours=%r zone=%r',
        days, hours, zone)
    flow_data = compute_flow_data(days, hours, zone)

    # One DataFrame per outflow section, built in the original order.
    top, rest, borough = (
        vaex.from_dict(flow_data[section])
        for section in ('outflow_top', 'outflow_rest', 'outflow_borough')
    )

    pickup_zone = zone
    sankey = create_figure_sankey(top, rest, borough, pickup_zone)
    sunburst = create_figure_sunburst(top, rest, borough, pickup_zone)
    records, style = create_table_data(top)
    return sankey, sunburst, records, style, 'trigger loader'
def _append_listed_dict_to_df(self, data, check_unique=True):
    """Append pre-processed dict to self._df.

    Args:
        data (dict): data to add
        check_unique (bool): if True, it will be checked that the data is unique in the db

    """
    # First insert: the dict simply becomes the dataframe.
    if self._df is None:
        self._df = vaex.from_dict(data)
        return

    # NOTE(review): the code below goes through `self.df` while the branch
    # above uses `self._df` - presumably `df` is a property over `_df`; confirm.
    if check_unique:
        # TODO: support unique-check for multiple items
        df_uuid = data['uuid_in_df'][0]
        matches = self.df[self.df.uuid_in_df.str.equals(df_uuid)]
        if len(matches) > 0:
            logging.warning('Given data already exist in dataframe: {}'.format(df_uuid))
            return

    self.df = self.df.concat(vaex.from_dict(data))
class MlTest(TestCase):
    """Checks that the model files exist and that the model state yields a prediction."""

    # Single-row frame carrying the columns the model state is applied to.
    # ("route_id" used to be listed twice with the same value; the duplicate
    # key was dropped, which leaves the resulting dict unchanged.)
    model_df = vaex.from_dict({
        "route_id": array(["60-155-d12-1"], dtype=object),
        "start_date": array([20210302]),
        "start_time": array(["19:20:00"], dtype=object),
        "stop_sequence": array([24]),
        "arrival": array([5.0]),
        "timestamp": array(["2021-03-02 19:51:26"], dtype=object),
        "stop_id": array(["8220DB000264"], dtype=object),
        "arrival_time": array(["19:38:38"], dtype=object),
        "shape_dist_traveled": array([7818.16]),
        "direction": array(["0"], dtype=object),
        "lat": array([53.3535353]),
        "lon": array([-6.26225863]),
        "direction_angle": array([139.31470635]),
        "shape_dist_between": array([518.6]),
        "arr_dow": array([1]),
        "arr_hour": array([19]),
        "arrival_mean": array([6.0]),
        "p_mean_vol": array([68.53864425]),
    })

    def test_files(self):
        """Every data/model file the prediction code relies on must exist."""
        # A plain loop: the assertions are side effects, not values to collect.
        for p in [gtfsr_historical_means_path, stop_time_data_path, gtfsr_model_path]:
            self.assertTrue(os.path.exists(p))

    def test_model(self):
        """Loading the model state must produce a truthy ``p_arrival_lgbm`` value."""
        self.model_df.state_load(gtfsr_model_path)
        pred_val = self.model_df[["p_arrival_lgbm"]][0][0]
        self.assertTrue(pred_val)
def test_ipython_autocompletion(ds_local):
    """Key completions list every column name, including non-identifier names."""
    column_data = {
        'First name': ['Reggie', 'Tamika'],
        'Last name': ['Miller', 'Catchings'],
        '$amount': [10, 20],
    }
    frame = vaex.from_dict(column_data)
    completions = frame._ipython_key_completions_()

    for present in ('First name', 'Last name', '$amount'):
        assert present in completions
    assert 'Team' not in completions
def test_add_invalid_name(tmpdir):
    """Invalid identifiers and keywords work as column names, also after an export."""
    # support invalid names and keywords
    expected = x.tolist()
    doubled = (x * 2).tolist()

    frame = vaex.from_dict({'X!1': x, 'class': x * 2})
    assert frame['X!1'].tolist() == expected
    assert (frame['X!1'] * 2).tolist() == doubled
    assert (frame['class']).tolist() == doubled
    assert 'X!1' in frame._column_aliases
    assert (frame.copy()['X!1'] * 2).tolist() == doubled

    # Round-trip through an hdf5 file.
    path = str(tmpdir.join('test.hdf5'))
    frame.export(path)
    reopened = vaex.open(path)
    assert reopened['X!1'].tolist() == expected
    assert (reopened.copy()['X!1'] * 2).tolist() == doubled
def _test_df_to_vaex():
    """Convert pandas dataframe to vaex."""

    def _serialize(element):
        # Lists are flattened to a ':'-separated string; anything else passes through.
        return ':'.join(element) if isinstance(element, list) else element

    with open("test/small_dflist.pkl", "rb") as f:
        df_dict = pickle.load(f)

    # (key in the pickle, export path), in the original processing order
    targets = (
        ("content_df", 'test/content_df.arrow'),
        ("file_df", 'test/file_df.arrow'),
        ("record_id_df", 'test/record_id_df.arrow'),
    )

    # load Pandas DataFrame and Serialize
    serialized = [df_dict[key].applymap(_serialize).to_dict('list') for key, _ in targets]

    # Create Vaex DataFrame
    frames = [vaex.from_dict(columns) for columns in serialized]

    # Export as .arrow
    for frame, (_, path) in zip(frames, targets):
        frame.export(path)
def test_invalid_name_read(tmpdir):
    """Files written by earlier vaex versions with invalid column names stay readable."""
    # earlier version of vaex could write invalid names, check if we can read those
    df = vaex.from_dict({'x': x})
    path = str(tmpdir.join('test.hdf5'))
    df.export(path)

    # Rename the stored column 'x' to the invalid name '1' directly in the file.
    # The mode is explicit because the h5py default became read-only in h5py 3,
    # and the context manager flushes and closes the file before vaex reopens it.
    with h5py.File(path, 'r+') as h5:
        columns = h5['/table/columns']
        columns['1'] = columns['x']
        del columns['x']

    df = vaex.open(path)
    assert df['1'].tolist() == x.tolist()
    assert (df.copy()['1'] * 2).tolist() == (x * 2).tolist()
def test_groupby_datetime():
    """Grouping by a datetime column keeps the datetime dtype and averages per date."""
    january = np.datetime64('2020-01-01')
    february = np.datetime64('2020-02-01')
    frame = vaex.from_dict({
        'z': [2, 4, 8, 10],
        't': [january, january, february, february],
    })

    grouped = frame.groupby(by='t', sort=True).agg({'z': 'mean'})

    assert grouped.column_count() == 2
    assert grouped.z.tolist() == [3, 9]
    assert grouped.t.dtype.is_datetime
    expected_dates = {datetime.date(2020, 1, 1), datetime.date(2020, 2, 1)}
    assert set(grouped.t.tolist()) == expected_dates
def test_non_identifiers():
    """``variables()`` resolves dependencies through non-identifier column names."""
    frame = vaex.from_dict({'x': [1], 'y': [2], '#': [1]})

    frame['z'] = frame['#'] + 1
    assert frame['z'].variables() == {'#'}
    assert frame._virtual_expressions['z'].variables() == {'#'}

    # '1' and '2' are virtual columns built on the real columns x and y.
    real = {'x', 'y'}
    frame['1'] = frame.x * frame.y
    frame['2'] = frame['1'] + frame.x
    assert frame['1'].variables(ourself=True) == real | {'1'}
    assert frame['1'].variables() == real
    assert frame['2'].variables(ourself=True) == real | {'2', '1'}
    assert frame['2'].variables(include_virtual=False) == real

    frame['valid'] = frame['2']
    assert frame['valid'].variables(ourself=True) == real | {'2', '1', 'valid'}
    assert frame['valid'].variables(include_virtual=False) == real
def __init__(self, expression, df, multipliers, parents, sort, row_limit=None):
    '''Will group by 1 expression, which is built up from multiple expressions.

    Used in the sparse/combined group by.

    :param expression: forwarded to the base initializer and stored on ``self.expression``
    :param df: forwarded to the base initializer and stored on ``self.df``
    :param multipliers: one divisor per parent, used to split each combined bin
        value into per-parent indices; the last one must be 1
    :param parents: groupers whose ``bin_values`` (and ``label``) are used to
        rebuild the labels
    :param sort: forwarded to the base initializer
    :param row_limit: forwarded to the base initializer
    '''
    super().__init__(expression, df, sort=sort, row_limit=row_limit)
    assert len(multipliers) == len(parents)
    assert multipliers[-1] == 1
    self.df = df
    self.label = 'SHOULD_NOT_BE_USED'
    self.expression = expression
    # efficient way to find the original bin values (parent.bin_value) from the 'compressed'
    # self.bin_values
    # NOTE(review): self.N and self.bin_values are read here without being set in
    # this method - presumably the base initializer provides them; confirm.
    df = vaex.from_dict({
        'row': vaex.vrange(0, self.N, dtype='i8'),
        'bin_value': self.bin_values
    })
    # Split each combined bin value into one index per parent: integer-divide by
    # the parent's multiplier and carry the remainder on to the next parent.
    df[f'index_0'] = df['bin_value'] // multipliers[0]
    df[f'leftover_0'] = df[f'bin_value'] % multipliers[0]
    for i in range(1, len(multipliers)):
        df[f'index_{i}'] = df[f'leftover_{i-1}'] // multipliers[i]
        df[f'leftover_{i}'] = df[f'leftover_{i-1}'] % multipliers[i]
    columns = [f'index_{i}' for i in range(len(multipliers))]
    indices_parents = df.evaluate(columns)
    bin_values = {}
    for indices, parent in zip(indices_parents, parents):
        dtype = vaex.dtype_of(parent.bin_values)
        if dtype.is_struct:
            # collapse parent struct into our flat struct
            for field, ar in zip(parent.bin_values.type, parent.bin_values.flatten()):
                bin_values[field.name] = ar.take(indices)
        else:
            bin_values[parent.label] = parent.bin_values.take(indices)
    # Replace the combined values by a struct array with one field per label.
    self.bin_values = pa.StructArray.from_arrays(bin_values.values(), bin_values.keys())
def __init__(self, expression, df, multipliers, parents, sort, row_limit=None):
    '''Will group by 1 expression, which is built up from multiple expressions.

    Used in the sparse/combined group by.

    :param expression: forwarded to the base initializer and stored on ``self.expression``
    :param df: forwarded to the base initializer and stored on ``self.df``
    :param multipliers: one divisor per parent, used to split each combined bin
        value into per-parent indices; the last one must be 1
    :param parents: groupers whose ``bin_values`` (and ``label``) are used to
        rebuild the labels
    :param sort: forwarded to the base initializer
    :param row_limit: forwarded to the base initializer
    '''
    super().__init__(expression, df, sort=sort, row_limit=row_limit)
    assert len(multipliers) == len(parents)
    assert multipliers[-1] == 1
    self.df = df
    self.label = 'SHOULD_NOT_BE_USED'
    self.expression = expression
    # efficient way to find the original bin values (parent.bin_value) from the 'compressed'
    # self.bin_values
    # NOTE(review): self.N and self.bin_values are read here without being set in
    # this method - presumably the base initializer provides them; confirm.
    df = vaex.from_dict({'row': vaex.vrange(0, self.N, dtype='i8'), 'bin_value': self.bin_values})
    # Split each combined bin value into one index per parent: integer-divide by
    # the parent's multiplier and carry the remainder on to the next parent.
    df[f'index_0'] = df['bin_value'] // multipliers[0]
    df[f'leftover_0'] = df[f'bin_value'] % multipliers[0]
    for i in range(1, len(multipliers)):
        df[f'index_{i}'] = df[f'leftover_{i-1}'] // multipliers[i]
        df[f'leftover_{i}'] = df[f'leftover_{i-1}'] % multipliers[i]
    columns = [f'index_{i}' for i in range(len(multipliers))]
    indices_parents = df.evaluate(columns)

    def compress(ar):
        # Integer arrays are cast to the dtype that
        # vaex.utils.required_dtype_for_max reports for their maximum
        # (presumably the smallest sufficient one - confirm).
        if vaex.dtype_of(ar).kind == 'i':
            ar = vaex.array_types.to_numpy(ar)
            max_value = ar.max()
            ar = ar.astype(vaex.utils.required_dtype_for_max(max_value))
        return ar

    indices_parents = [compress(ar) for ar in indices_parents]
    bin_values = {}
    # NOTE: we can also use dict encoding instead of take
    for indices, parent in zip(indices_parents, parents):
        dtype = vaex.dtype_of(parent.bin_values)
        if dtype.is_struct:
            # collapse parent struct into our flat struct
            for field, ar in zip(parent.bin_values.type, parent.bin_values.flatten()):
                bin_values[field.name] = ar.take(indices)
                # bin_values[field.name] = pa.DictionaryArray.from_arrays(indices, ar)
        else:
            bin_values[parent.label] = parent.bin_values.take(indices)
            # bin_values[parent.label] = pa.DictionaryArray.from_arrays(indices, parent.bin_values)
    # Replace the combined values by a struct array with one field per label.
    self.bin_values = pa.StructArray.from_arrays(bin_values.values(), bin_values.keys())
def _from_dataframe_to_vaex(df: DataFrameObject) -> vaex.dataframe.DataFrame:
    """
    Build a vaex DataFrame from an interchange-protocol dataframe, chunk by chunk.

    Note: we need to implement/test support for bit/byte masks, chunk handling, etc.
    """
    kinds = _DtypeKind
    numeric_kinds = (kinds.INT, kinds.UINT, kinds.FLOAT, kinds.BOOL)

    chunk_frames = []
    all_buffers = []
    for chunk in df.get_chunks():
        # One dict of columns per chunk, plus the buffers backing them.
        columns = {}
        chunk_buffers = []  # hold on to buffers, keeps memory alive
        for name in chunk.column_names():
            if not isinstance(name, str):
                raise ValueError(f"Column {name} is not a string")
            if name in columns:
                raise ValueError(f"Column {name} is not unique")

            col = chunk.get_column_by_name(name)
            kind = col.dtype[0]
            if kind in numeric_kinds:
                # Simple numerical or bool dtype, turn into arrow array
                convert = convert_column_to_ndarray
            elif kind == kinds.CATEGORICAL:
                convert = convert_categorical_column
            elif kind == kinds.STRING:
                convert = convert_string_column
            else:
                raise NotImplementedError(f"Data type {kind} not handled yet")

            columns[name], buffer = convert(col)
            chunk_buffers.append(buffer)

        chunk_frames.append(vaex.from_dict(columns))
        # chunk buffers are added to list of all buffers
        all_buffers.append(chunk_buffers)

    # A single chunk stores its buffer list directly instead of a nested list.
    if df.num_chunks() == 1:
        all_buffers = all_buffers[0]

    result = vaex.concat(chunk_frames)
    result._buffers = all_buffers
    return result
def test_add_invalid_name(tmpdir):
    """Invalid identifiers and keywords work as column names, through export and concat."""
    # support invalid names and keywords
    expected = x.tolist()
    doubled = (x*2).tolist()

    frame = vaex.from_dict({'X!1': x, 'class': x*2})
    assert str(frame['X!1']) != 'X!1', "invalid identifier cannot be an expression"
    assert str(frame['class']) != 'class', "keyword cannot be an expression"
    assert frame.get_column_names() == ['X!1', 'class']
    assert frame['X!1'].tolist() == expected
    assert (frame['X!1']*2).tolist() == doubled
    assert (frame['class']).tolist() == doubled
    assert (frame.copy()['X!1']*2).tolist() == doubled

    # Round-trip through an hdf5 file.
    path = str(tmpdir.join('test.hdf5'))
    frame.export(path)
    reopened = vaex.open(path)
    assert reopened['X!1'].tolist() == expected
    assert (reopened.copy()['X!1']*2).tolist() == doubled
    assert (reopened[['X!1']]['X!1']*2).tolist() == doubled

    stacked = vaex.concat([reopened, reopened])
    assert (stacked[['X!1']]['X!1']*2).tolist() == (doubled + doubled)
def test_add_invalid_name(tmpdir):
    """Invalid names are exposed through aliases and survive export and concat."""
    # support invalid names and keywords
    expected = x.tolist()
    doubled = (x*2).tolist()
    names = ['X!1', 'class']

    frame = vaex.from_dict({'X!1': x, 'class': x*2})
    assert frame.get_column_names() == names
    assert frame.get_column_names(alias=False) != names
    assert frame['X!1'].tolist() == expected
    assert (frame['X!1']*2).tolist() == doubled
    assert (frame['class']).tolist() == doubled
    assert 'X!1' in frame._column_aliases
    assert (frame.copy()['X!1']*2).tolist() == doubled

    # Round-trip through an hdf5 file.
    path = str(tmpdir.join('test.hdf5'))
    frame.export(path)
    reopened = vaex.open(path)
    assert reopened['X!1'].tolist() == expected
    assert (reopened.copy()['X!1']*2).tolist() == doubled
    assert (reopened[['X!1']]['X!1']*2).tolist() == doubled

    stacked = vaex.concat([reopened, reopened])
    assert (stacked[['X!1']]['X!1']*2).tolist() == (doubled + doubled)
def predict(self, instances, **kwargs):
    """Return predictions for ``instances`` given as a list of lists or a list of dicts.

    Any other element type yields ``['invalid input format']``.
    """
    first = instances[0]
    if not isinstance(first, (list, dict)):
        return ['invalid input format']

    if isinstance(first, list):
        # Rows arrive as [Arrival_Time, Creation_Time, x, y, z]; transpose to columns.
        by_column = np.asarray(instances).T
        names = ('Arrival_Time', 'Creation_Time', 'x', 'y', 'z')
        frame = vaex.from_arrays(**{name: by_column[i] for i, name in enumerate(names)})
    else:
        # One small frame per dict, stacked into a single frame.
        frame = vaex.concat([vaex.from_dict(instance) for instance in instances])

    frame.state_set(self.state, set_filter=False)
    return frame.pred_name.tolist()
def predict(data: Data):
    """Return ``{'predictions': ...}`` for ``data.instances`` (list of lists or list of dicts).

    Any other element type yields ``{'predictions': 'invalid input format'}``.
    """
    instances = data.instances
    first = instances[0]
    if not isinstance(first, (list, dict)):
        return {'predictions': 'invalid input format'}

    if isinstance(first, list):
        # Rows arrive as [Arrival_Time, Creation_Time, x, y, z]; transpose to columns.
        by_column = np.asarray(instances).T
        names = ('Arrival_Time', 'Creation_Time', 'x', 'y', 'z')
        frame = vaex.from_arrays(**{name: by_column[i] for i, name in enumerate(names)})
    else:
        # One small frame per dict, stacked into a single frame.
        frame = vaex.concat([vaex.from_dict(instance) for instance in instances])

    frame.state_set(global_items['state'], set_filter=False)
    return {'predictions': frame.pred_name.tolist()}
def agg(self, actions):
    """Aggregate ``actions`` per group and return a DataFrame of the non-empty groups.

    :param actions: aggregation specification, forwarded unchanged to the base ``_agg``
    :return: DataFrame with one label column per group-by expression and one
        column per aggregation, restricted to grid cells with a count above zero
    """
    # TODO: this basically forms a cartesian product, we can do better, use a
    # 'multistage' hashmap
    arrays = super(GroupBy, self)._agg(actions)
    # we don't want non-existing pairs (e.g. Amsterdam in France does not exist)
    count_agg = vaex.agg.count()
    counts = self.df._agg(count_agg, self.grid, delay=_USE_DELAY)
    self.df.execute()
    if _USE_DELAY:
        arrays = {key: value.get() for key, value in arrays.items()}
        counts = counts.get()
    # take out the edges
    arrays = {key: vaex.utils.extract_central_part(value) for key, value in arrays.items()}
    counts = vaex.utils.extract_central_part(counts)
    mask = counts > 0
    coords = [coord[mask] for coord in np.meshgrid(*self.coords1d, indexing='ij')]
    labels = {str(by.expression): coord for by, coord in zip(self.by, coords)}
    df_grouped = vaex.from_dict(labels)
    # Copy each aggregated column once; this loop used to be repeated verbatim,
    # assigning the same values a second time.
    for key, value in arrays.items():
        df_grouped[key] = value[mask]
    return df_grouped
def test_from_dict():
    """``from_dict`` keeps column names and values for numeric and string data."""
    frame = vaex.from_dict({'A': [1, 2, 3], 'B': ['a', 'b', 'c']})

    assert 'A' in frame.get_column_names()
    first_a = frame['A'].values[0]
    assert first_a == 1
    last_b = frame['B'].values[2]
    assert last_b == 'c'
def vaex_vertices_from_plyfile(filename):
    """Read the vertices of the PLY file ``filename`` into a vaex DataFrame."""
    vertex_columns = vertex_dict_from_plyfile(filename)
    vertices = vx.from_dict(vertex_columns)
    return vertices
def make_prediction(data):
    """Predict the arrival for one live update using the stored vaex model state.

    :param data: mapping with the keys ``route_id``, ``direction``,
        ``stop_sequence``, ``stop_id``, ``start_time``, ``start_date``,
        ``timestamp`` and ``arrival`` (all read below)
    :return: ``("", "")`` when ``start_time``/``start_date`` is missing, when a
        filter or join leaves no rows, or when applying the model state raises;
        otherwise ``(round(p) * 60, p)`` where ``p`` is the first
        ``p_arrival_lgbm`` value
    """
    st_df = MlConfig.st_df  # stop_time_data
    hm_df = MlConfig.hm_df  # historical means dataset
    model = MlConfig.state_model  # GTFSR vaex model state
    empty = ("", "")

    if "start_time" not in data or "start_date" not in data:
        return empty

    # NOTE(review): arrival is divided by 60 here and the prediction multiplied
    # by 60 at the end - presumably seconds <-> minutes; confirm.
    formatted_data = {
        "route_id": [str(data["route_id"])],
        "direction": [int(data["direction"])],
        "stop_sequence": [int(data["stop_sequence"])],
        "stop_id": [str(data["stop_id"])],
        "start_time": [str(data["start_time"])],
        "start_date": [int(data["start_date"])],
        "timestamp": [str(data["timestamp"])],
        "arrival": [int(data["arrival"] / 60)],
    }

    live_df = vaex.from_dict(formatted_data)

    # day of week of the start date
    live_df["arr_dow"] = live_df.start_date.apply(
        lambda d: get_dt(d, "%Y%m%d").weekday())
    live_df.materialize("arr_dow", inplace=True)

    temp_df = st_df[
        (st_df["route_id"] == live_df[["route_id"]][0][0])
        & (st_df["stop_sequence"] == live_df[["stop_sequence"]][0][0])
        & (st_df["stop_id"] == live_df[["stop_id"]][0][0])
        & (st_df["start_time"] == live_df[["start_time"]][0][0])
        & (st_df["direction"] == live_df[["direction"]][0][0])].copy()

    if len(temp_df) < 1:
        return empty

    # join stop time data, filtering improves speed by only copying relevant rows
    cols = ["route_id", "stop_sequence", "stop_id", "start_time", "direction"]
    live_df = vaex_mjoin(live_df, temp_df, cols, cols, how="inner", allow_duplication=True)

    # service_days is a stringified list; keep the rows whose entry for the
    # arrival weekday is "True"
    live_df["keep_trip"] = live_df.apply(
        lambda sd, dow: sd.replace("[", "").replace("]", "").replace(" ", "").split(",")[dow],
        ["service_days", "arr_dow"],
    )
    live_df = live_df[live_df.keep_trip == "True"]
    live_df.drop(["service_days", "keep_trip"], inplace=True)

    if len(live_df) < 1:
        return empty

    live_df["arr_hour"] = live_df["arrival_time"].apply(
        lambda t: get_dt(t, "%H:%M:%S").hour)
    live_df.materialize("arr_hour", inplace=True)

    # join the historical means to our dataset
    temp_df = hm_df[
        (hm_df["route_id"] == data["route_id"])
        & (hm_df["stop_id"] == data["stop_id"])
        & (hm_df["arr_dow"] == live_df[["arr_dow"]][0][0])
        & (hm_df["arr_hour"] == live_df[["arr_hour"]][0][0])
        & (hm_df["direction"] == int(data["direction"]))
        & (hm_df["stop_sequence"] == live_df[["stop_sequence"]][0][0])].copy()

    if len(temp_df) < 1:
        return empty

    cols = [
        "route_id", "stop_id", "arr_dow", "arr_hour", "direction",
        "stop_sequence"
    ]
    live_df = vaex_mjoin(
        live_df,
        temp_df,
        cols,
        cols,
        how="inner",
    )

    if len(live_df) < 1:
        return empty

    # assert same type
    live_df["direction"] = live_df["direction"].astype("int64")
    live_df["shape_dist_traveled"] = live_df["shape_dist_traveled"].astype(
        "float64")
    live_df["lat"] = live_df["lat"].astype("float64")
    live_df["lon"] = live_df["lon"].astype("float64")
    live_df["direction_angle"] = live_df["direction_angle"].astype("float64")
    live_df["shape_dist_between"] = live_df["shape_dist_between"].astype(
        "float64")

    # materialize virtual columns to match model state
    # (a plain loop: materialize is called for its side effect only)
    for col in live_df.get_column_names():
        if col not in live_df.get_column_names(virtual=False):
            live_df.materialize(col, inplace=True)

    try:
        live_df.state_set(model)

        if len(live_df) > 0:
            return (round(live_df[["p_arrival_lgbm"]][0][0]) *
                    60), live_df[["p_arrival_lgbm"]][0][0]
    except Exception:
        # Best effort: any failure while applying the model yields the empty
        # result. ``Exception`` (not a bare ``except``) so that
        # KeyboardInterrupt and SystemExit still propagate.
        return empty
    return empty
def test_unicode_names():
    """A non-ASCII column name is reachable through attribute access."""
    values = np.arange(10)
    frame = vaex.from_dict({'远': values})
    assert frame.远.tolist() == values.tolist()
def test_not_hide_invalid_name():
    """A column with an invalid identifier as name is still listed and readable."""
    values = np.arange(10)
    frame = vaex.from_dict({'./bla': values})

    listed = frame.get_column_names()
    assert len(listed) == 1
    assert frame['./bla'].tolist() == values.tolist()
def test_hdf5_with_alias(tmpdir):
    """Open a stored hdf5 file whose column names are not valid identifiers."""
    # NOTE(review): this frame is built but never used afterwards - presumably
    # it documents the content of the stored file; confirm.
    unused_frame = vaex.from_dict({'X-1': [1], '#': [2]})

    stored = DATA_PATH / 'with_alias.hdf5'
    opened = vaex.open(str(stored))
    assert opened['X-1'].tolist() == [1]
    assert opened['#'].tolist() == [2]
def test_random_projections(n_components, matrix_type): df = vaex.from_dict(data=data_maker(n_rows=100_000, n_cols=31))