def to_table(np_array: np.ndarray, cols: List[str]) -> Table:
    """Creates a new table from a numpy array.

    Args:
        np_array (np.ndarray): the numpy array
        cols (List[str]): the table column names that will be assigned to each column in the numpy array

    Returns:
        a Deephaven table

    Raises:
        DHError
    """
    try:
        _, *dims = np_array.shape
        if dims:
            if not cols or len(cols) != dims[0]:
                raise DHError(
                    message=f"the number of array columns {dims[0]} doesn't match "
                            f"the number of column names {len(cols)}")

        input_cols = []
        if len(cols) == 1:
            input_cols.append(_make_input_column(cols[0], np_array))
        else:
            for i, col in enumerate(cols):
                input_cols.append(_make_input_column(col, np_array[:, [i]]))

        return new_table(cols=input_cols)
    except DHError:
        raise
    except Exception as e:
        raise DHError(e, "failed to create a Deephaven Table from a numpy array.") from e
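
# Usage sketch (illustrative, not part of the original module): assumes this
# function is exposed as deephaven.numpy.to_table. A 2-D array maps array
# column i to the table column named cols[i].
#
# import numpy as np
# from deephaven.numpy import to_table
#
# arr = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.int32)
# t = to_table(arr, cols=["X", "Y"])  # 3 rows, columns X and Y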
def to_table(df: pandas.DataFrame, cols: List[str] = None) -> Table:
    """Creates a new table from a pandas.DataFrame.

    Args:
        df (DataFrame): the pandas DataFrame instance
        cols (List[str]): the DataFrame column names, default is None, which means including all columns in the
            DataFrame

    Returns:
        a Deephaven table

    Raises:
        DHError
    """
    try:
        if not cols:
            cols = list(df)
        else:
            diff_set = set(cols) - set(list(df))
            if diff_set:
                raise DHError(message=f"columns - {list(diff_set)} not found")

        input_cols = []
        for col in cols:
            input_cols.append(_make_input_column(col, df.get(col).values))

        return new_table(cols=input_cols)
    except DHError:
        raise
    except Exception as e:
        raise DHError(e, "failed to create a Deephaven Table from a Pandas DataFrame.") from e
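
# Usage sketch (illustrative, not part of the original module): assumes this
# function is exposed as deephaven.pandas.to_table.
#
# import pandas as pd
# from deephaven.pandas import to_table
#
# df = pd.DataFrame({"Sym": ["AAPL", "MSFT"], "Price": [150.0, 250.0]})
# t = to_table(df)                    # all columns
# t_sym = to_table(df, cols=["Sym"])  # a subset of the columns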
def __call__(self, *args, **kwargs):
    if self.is_primitive:
        raise DHError(message=f"primitive type {self.j_name} is not callable.")

    try:
        return self.j_type(*args, **kwargs)
    except Exception as e:
        raise DHError(e, f"failed to create an instance of {self.j_name}") from e
def array(dtype: DType, seq: Sequence, remap: Callable[[Any], Any] = None) -> jpy.JType:
    """Creates a Java array of the specified data type populated with values from a sequence.

    Note:
        this method does unsafe casting, meaning precision and values may be lost when down-casting

    Args:
        dtype (DType): the component type of the array
        seq (Sequence): a sequence of compatible data, e.g. list, tuple, numpy array, Pandas series, etc.
        remap (optional): a callable that takes one value and maps it to another, for handling the translation of
            special DH values such as NULL_INT, NAN_INT between Python and the DH engine

    Returns:
        a Java array

    Raises:
        DHError
    """
    try:
        if remap:
            if not callable(remap):
                raise ValueError("remap is not a callable")
            seq = [remap(v) for v in seq]
        else:
            if isinstance(seq, str) and dtype == char:
                return array(char, seq, remap=ord)

        return jpy.array(dtype.j_type, seq)
    except Exception as e:
        raise DHError(e, f"failed to create a Java {dtype.j_name} array.") from e
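
# Usage sketch (illustrative, not part of the original module): assumes this
# function and the DType constants live in deephaven.dtypes, and that NULL_INT
# is importable from deephaven.constants.
#
# from deephaven import dtypes
# from deephaven.constants import NULL_INT
#
# j_ints = dtypes.array(dtypes.int32, [1, 2, 3])
# # remap translates special values (e.g. Python None -> DH NULL_INT) on the way in:
# j_with_nulls = dtypes.array(dtypes.int32, [1, None, 3],
#                             remap=lambda v: NULL_INT if v is None else v)
# j_chars = dtypes.array(dtypes.char, "abc")  # a str maps to a char[] via ord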
def __init__(self,
             host: Optional[str] = None,
             port: Optional[int] = None,
             jvm_args: Optional[List[str]] = None,
             dh_args: Dict[str, str] = {}):
    """Creates a Deephaven embedded server. Only one instance can be created at this time."""
    # TODO deephaven-core#2453 consider providing @dataclass for arguments

    # If the server was already created, emit an error to warn away from trying again
    if Server.instance is not None:
        from deephaven import DHError
        raise DHError(message='Cannot create more than one instance of the server')

    # given the jvm args, ensure that the jvm has started
    start_jvm(jvm_args=jvm_args)

    # it is now safe to import jpy
    import jpy

    # Create a wrapped java server that we can reference to talk to the platform
    self.j_server = jpy.get_type('io.deephaven.python.server.EmbeddedServer')(host, port, dh_args)

    # Keep a reference to the server so we know it is running
    Server.instance = self
def query_performance(eval_number: int) -> Table:
    """Takes in a query evaluation number and returns a view for that query's performance data.

    You can obtain query evaluation numbers, which uniquely identify a query and its subqueries, via the
    performance data tables obtained from calling query_performance_log() or query_operation_performance_log().

    The query performance log contains data on how long each query takes to run. Examples of what constitutes one
    individual query, for performance logging purposes, include:

    * A new command in the console (i.e. type something, then press the return key)
    * A sort, filter, or custom column generated by a UI
    * A call from a client API external application

    Args:
        eval_number (int): the evaluation number

    Returns:
        a Table of query performance data

    Raises:
        DHError
    """
    try:
        return Table(j_table=_JPerformanceQueries.queryPerformance(eval_number))
    except Exception as e:
        raise DHError(e, "failed to obtain the query performance data.") from e
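
# Usage sketch (illustrative, not part of the original module): assumes these
# functions live in deephaven.perfmon; the evaluation number below is hypothetical.
#
# from deephaven.perfmon import query_performance_log, query_performance
#
# qpl = query_performance_log()            # browse this table to find evaluation numbers
# perf = query_performance(eval_number=1)  # drill into one query's performance data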
def start(self, do_replay: bool = False, replay_lock: str = "shared") -> None:
    """Start the listener by registering it with the table and listening for updates.

    Args:
        do_replay (bool): whether to replay the initial snapshot of the table, default is False
        replay_lock (str): the lock type used during replay, default is 'shared', can also be 'exclusive'

    Raises:
        DHError
    """
    if self.started:
        raise RuntimeError("Attempting to start an already started listener.")

    try:
        def _start():
            if do_replay:
                self.listener.replay()
            self.t.j_table.listenForUpdates(self.listener)

        if do_replay:
            _do_locked(_start, lock_type=replay_lock)
        else:
            _start()
    except Exception as e:
        raise DHError(e, "failed to listen to the table changes.") from e

    self.started = True
def exact_join(self, table: Table, on: Union[str, Sequence[str]],
               joins: Union[str, Sequence[str]] = None) -> Table:
    """The exact_join method creates a new table containing all of the rows and columns of this table plus
    additional columns containing data from the right table. For columns appended to the left table (joins), row
    values equal the row values from the right table where the key values in the left and right tables are equal.

    Args:
        table (Table): the right-table of the join
        on (Union[str, Sequence[str]]): the column(s) to match, can be a common name or an equal expression,
            i.e. "col_a = col_b" for different column names
        joins (Union[str, Sequence[str]], optional): the column(s) to be added from the right table to the result
            table, can be renaming expressions, i.e. "new_col = col"; default is None

    Returns:
        a new table

    Raises:
        DHError
    """
    try:
        on = to_sequence(on)
        joins = to_sequence(joins)
        if joins:
            return Table(j_table=self.j_table.exactJoin(table.j_table, ",".join(on), ",".join(joins)))
        else:
            return Table(j_table=self.j_table.exactJoin(table.j_table, ",".join(on)))
    except Exception as e:
        raise DHError(e, "table exact_join operation failed.") from e
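
# Usage sketch (illustrative, not part of the original module): every left row
# must have exactly one match in the right table.
#
# from deephaven import new_table
# from deephaven.column import int_col, string_col
#
# left = new_table([int_col("DeptId", [1, 2]), string_col("Name", ["Ann", "Bob"])])
# right = new_table([int_col("DeptId", [1, 2]), string_col("Dept", ["Sales", "Eng"])])
# joined = left.exact_join(right, on=["DeptId"], joins=["Dept"])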
def join(self, table: Table, on: Union[str, Sequence[str]] = None,
         joins: Union[str, Sequence[str]] = None) -> Table:
    """The join method creates a new table containing rows that have matching values in both tables. Rows that do
    not have matching criteria will not be included in the result. If there are multiple matches between a row
    from the left table and rows from the right table, all matching combinations will be included. If no columns
    to match (on) are specified, every combination of left and right table rows is included.

    Args:
        table (Table): the right-table of the join
        on (Union[str, Sequence[str]]): the column(s) to match, can be a common name or an equal expression,
            i.e. "col_a = col_b" for different column names; default is None
        joins (Union[str, Sequence[str]], optional): the column(s) to be added from the right table to the result
            table, can be renaming expressions, i.e. "new_col = col"; default is None

    Returns:
        a new table

    Raises:
        DHError
    """
    try:
        on = to_sequence(on)
        joins = to_sequence(joins)
        if joins:
            return Table(j_table=self.j_table.join(table.j_table, ",".join(on), ",".join(joins)))
        else:
            return Table(j_table=self.j_table.join(table.j_table, ",".join(on)))
    except Exception as e:
        raise DHError(e, "table join operation failed.") from e
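
# Usage sketch (illustrative, not part of the original module):
#
# from deephaven import new_table
# from deephaven.column import int_col, string_col
#
# orders = new_table([int_col("Id", [1, 2, 3]), int_col("Qty", [10, 20, 30])])
# items = new_table([int_col("Id", [1, 2]), string_col("Item", ["apple", "pear"])])
#
# inner = orders.join(items, on=["Id"])                              # rows with Id 1 and 2 only
# renamed = orders.join(items, on=["Id"], joins=["Product = Item"])  # rename the appended column
# cross = orders.join(items)                                         # no 'on': cross product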
def snapshot_history(self, source_table: Table) -> Table:
    """Produces an in-memory history of a source table that adds a new snapshot when this table (trigger table)
    changes.

    The trigger table is often a time table that adds new rows at a regular, user-defined interval.

    Columns from the trigger table appear in the result table. If the trigger and source tables have columns with
    the same name, an error will be raised. To avoid this problem, rename conflicting columns.

    Because snapshot_history stores a copy of the source table for every trigger event, large source tables or
    rapidly changing trigger tables can result in large memory usage.

    Args:
        source_table (Table): the table to be snapshot

    Returns:
        a new table

    Raises:
        DHError
    """
    try:
        return Table(j_table=self.j_table.snapshotHistory(source_table.j_table))
    except Exception as e:
        raise DHError(e, "failed to create a snapshot history table.") from e
def consume_raw(
    kafka_config: dict,
    cdc_spec: CDCSpec,
    partitions=None,
    table_type: TableType = TableType.stream(),
) -> Table:
    """Consume the raw events from a Change Data Capture (CDC) Kafka stream to a Deephaven table.

    Args:
        kafka_config (Dict): configuration for the associated Kafka consumer and also the resulting table.
            Passed to the org.apache.kafka.clients.consumer.KafkaConsumer constructor; pass any
            KafkaConsumer-specific desired configuration here. Note this should include the relevant property
            for a schema server URL where the necessary Avro schemas for the key and/or value are stored.
        cdc_spec (CDCSpec): a CDCSpec obtained from calling either the cdc_long_spec or the cdc_short_spec
            function
        partitions (List[int]): a list of integer partition numbers, default is None indicating all partitions
        table_type (TableType): a TableType enum, default is TableType.stream()

    Returns:
        a Deephaven live table for the raw CDC events

    Raises:
        DHError
    """
    try:
        partitions = j_partitions(partitions)
        kafka_config = j_properties(kafka_config)
        table_type_enum = table_type.value
        return Table(j_table=_JCdcTools.consumeRawToTable(kafka_config, cdc_spec.j_object, partitions,
                                                          table_type_enum))
    except Exception as e:
        raise DHError(e, "failed to consume a raw CDC stream.") from e
def query_operation_performance(eval_number: int) -> Table:
    """Takes in a query evaluation number and returns a view for that query's individual operations' performance
    data.

    You can obtain query evaluation numbers, which uniquely identify a query and its subqueries, via the
    performance data tables obtained from calling query_performance_log() or query_operation_performance_log().

    The query operation performance log contains data on how long each individual operation of a query (where(),
    update(), naturalJoin(), etc., as well as internal functions) takes to execute, and the change in resource
    consumption while each was executing.

    Args:
        eval_number (int): the evaluation number

    Returns:
        a table of query operation performance data

    Raises:
        DHError
    """
    try:
        return Table(j_table=_JPerformanceQueries.queryOperationPerformance(eval_number))
    except Exception as e:
        raise DHError(e, "failed to obtain the query operation performance data.") from e
def __init__(self, width: float = 1.0, end_style: LineEndStyle = LineEndStyle.ROUND,
             join_style: LineJoinStyle = LineJoinStyle.ROUND, dash_pattern: List[Number] = None):
    """Creates a LineStyle object.

    Args:
        width (float): the width of the line, default is 1.0
        end_style (LineEndStyle): the end style of the line, default is LineEndStyle.ROUND
        join_style (LineJoinStyle): the join style of the line, default is LineJoinStyle.ROUND
        dash_pattern (List[Number]): a list of numbers specifying the dash pattern of the line

    Raises:
        DHError
    """
    try:
        if dash_pattern:
            self.j_line_style = _JLineStyle.lineStyle(width, end_style.value, join_style.value, *dash_pattern)
        else:
            self.j_line_style = _JLineStyle.lineStyle(width, end_style.value, join_style.value, None)

        self.width = width
        self.end_style = end_style
        self.join_style = join_style
        self.dash_pattern = dash_pattern
    except Exception as e:
        raise DHError(e, "failed to create a LineStyle.") from e
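
# Usage sketch (illustrative, not part of the original module): assumes
# LineStyle and the two enums are importable from deephaven.plot.
#
# from deephaven.plot import LineStyle, LineEndStyle, LineJoinStyle
#
# dashed = LineStyle(width=2.0, end_style=LineEndStyle.BUTT,
#                    join_style=LineJoinStyle.MITER,
#                    dash_pattern=[5.0, 3.0])  # 5px dash, 3px gap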
def time_window(table: Table, ts_col: str, window: int, bool_col: str) -> Table:
    """Creates a new table by applying a time window to the source table and adding a new Boolean column.

    The value of the new Boolean column is false when the timestamp column value is older than the window from
    now, and true otherwise. If the timestamp column value is null, the Boolean column value will be null as
    well. The result table ticks whenever the source table ticks, or modifies a row when it passes out of the
    window.

    Args:
        table (Table): the source table
        ts_col (str): the timestamp column name
        window (int): the size of the window in nanoseconds
        bool_col (str): the name of the new Boolean column

    Returns:
        a new Table

    Raises:
        DHError
    """
    try:
        return Table(j_table=_JWindowCheck.addTimeWindow(table.j_table, ts_col, window, bool_col))
    except Exception as e:
        raise DHError(e, "failed to create a time window table.") from e
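
# Usage sketch (illustrative, not part of the original module): assumes
# time_window is exposed in deephaven.experimental.
#
# from deephaven import time_table
# from deephaven.experimental import time_window
#
# src = time_table("00:00:01")  # a new row every second
# # InWindow is true while Timestamp is within the last 10 seconds (10**10 ns)
# windowed = time_window(src, ts_col="Timestamp", window=10_000_000_000, bool_col="InWindow")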
def _column_to_numpy_array(col_def: Column, j_array: jpy.JType) -> np.ndarray:
    """Produces a numpy array from the given Java array and the Table column definition."""
    try:
        if col_def.data_type.is_primitive:
            np_array = np.frombuffer(j_array, col_def.data_type.np_type)
        elif col_def.data_type == dtypes.DateTime:
            longs = _JPrimitiveArrayConversionUtility.translateArrayDateTimeToLong(j_array)
            np_long_array = np.frombuffer(longs, np.int64)
            np_array = np_long_array.view(col_def.data_type.np_type)
            np_array[:] = np_long_array
        elif col_def.data_type == dtypes.bool_:
            bytes_ = _JPrimitiveArrayConversionUtility.translateArrayBooleanToByte(j_array)
            np_array = np.frombuffer(bytes_, col_def.data_type.np_type)
        elif col_def.data_type.np_type is not np.object_:
            try:
                np_array = np.frombuffer(j_array, col_def.data_type.np_type)
            except Exception:
                np_array = np.array(j_array, np.object_)
        else:
            np_array = np.array(j_array, np.object_)

        return np_array
    except DHError:
        raise
    except Exception as e:
        raise DHError(e, f"failed to create a numpy array for the column {col_def.name}") from e
def one_click_table_map(tm: jpy.JType, t: Table, by: List[str] = None,
                        require_all_filters: bool = False) -> SelectableDataSet:
    """Creates a SelectableDataSet with the specified columns from the table map.

    Args:
        tm (jpy.JType): the source table map
        t (Table): the source table
        by (List[str]): the selected columns
        require_all_filters (bool): false to display data when not all oneclicks are selected; true to only
            display data when appropriate oneclicks are selected

    Returns:
        a SelectableDataSet

    Raises:
        DHError
    """
    if not by:
        by = []

    try:
        return SelectableDataSet(j_sds=_JSelectables.oneClick(tm, t.j_table, require_all_filters, *by))
    except Exception as e:
        raise DHError(e, "failed in one_click.") from e
def read(path: str, col_instructions: List[ColumnInstruction] = None, is_legacy_parquet: bool = False) -> Table:
    """Reads in a table from a single parquet file, metadata file, or directory with a recognized layout.

    Args:
        path (str): the file or directory to examine
        col_instructions (List[ColumnInstruction]): instructions for customizations while reading
        is_legacy_parquet (bool): whether the parquet data is in the legacy format

    Returns:
        a table

    Raises:
        DHError
    """
    try:
        read_instructions = _build_parquet_instructions(col_instructions=col_instructions,
                                                        is_legacy_parquet=is_legacy_parquet,
                                                        for_read=True)

        if read_instructions:
            return Table(j_table=_JParquetTools.readTable(path, read_instructions))
        else:
            return Table(j_table=_JParquetTools.readTable(path))
    except Exception as e:
        raise DHError(e, "failed to read parquet data.") from e
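
# Usage sketch (illustrative, not part of the original module): assumes this
# function is exposed as deephaven.parquet.read; the paths below are hypothetical.
#
# from deephaven.parquet import read
#
# t = read("/data/examples/trades.parquet")
# legacy = read("/data/legacy_dir", is_legacy_parquet=True)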
def _assert_type(name: str, obj: Any, types: List) -> None:
    """Assert that the input object is of the proper type.

    Args:
        name (str): name of the variable being converted to Java
        obj (Any): object being converted to Java
        types (List): acceptable types for the object

    Raises:
        DHError
    """
    types_no_subscript = tuple(set(t.__origin__ if isinstance(t, _GenericAlias) else t for t in types))
    if not isinstance(obj, types_no_subscript):
        supported = [t._name if isinstance(t, _GenericAlias) else t.__name__ for t in types_no_subscript]
        raise DHError(message=f"Improper input type: name={name} type={type(obj)} supported={supported}")
def snapshot(self, source_table: Table, do_init: bool = False, cols: Union[str, List[str]] = None) -> Table:
    """Produces an in-memory copy of a source table that refreshes when this table changes.

    Note, this table is often a time table that adds new rows at a regular, user-defined interval.

    Args:
        source_table (Table): the table to be snapshot
        do_init (bool): whether to snapshot when this method is initially called, default is False
        cols (Union[str, List[str]]): names of the columns of this table to be included in the snapshot, default
            is None, meaning all the columns

    Returns:
        a new table

    Raises:
        DHError
    """
    try:
        cols = to_sequence(cols)
        return Table(j_table=self.j_table.snapshot(source_table.j_table, do_init, *cols))
    except Exception as e:
        raise DHError(e, "failed to create a snapshot table.") from e
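
# Usage sketch (illustrative, not part of the original module): a time table is
# a typical trigger; snapshot is called on the trigger table.
#
# from deephaven import time_table
#
# source = time_table("00:00:01").update(["X = i"])
# trigger = time_table("00:00:05")
# snap = trigger.snapshot(source, do_init=True)  # re-snapshotted every 5 seconds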
def cdc_short_spec(server_name: str, db_name: str, table_name: str) -> CDCSpec:
    """Creates a CDCSpec in the Debezium style from the provided server name, database name and table name.

    The topic name, and key and value schema names are implied by convention:
    - Topic is the concatenation of the arguments using "." as separator.
    - Key schema name is topic with a "-key" suffix added.
    - Value schema name is topic with a "-value" suffix added.

    Args:
        server_name (str): the server_name configuration value used when the CDC Stream was created
        db_name (str): the database name configuration value used when the CDC Stream was created
        table_name (str): the table name configuration value used when the CDC Stream was created

    Returns:
        a CDCSpec

    Raises:
        DHError
    """
    try:
        return CDCSpec(j_spec=_JCdcTools.cdcShortSpec(server_name, db_name, table_name))
    except Exception as e:
        raise DHError(e, "failed to create a CDC spec in cdc_short_spec.") from e
def cdc_long_spec(
    topic: str,
    key_schema_name: str,
    key_schema_version: str,
    value_schema_name: str,
    value_schema_version: str,
) -> CDCSpec:
    """Creates a CDCSpec with all the required configuration options.

    Args:
        topic (str): the Kafka topic for the CDC events associated to the desired table data
        key_schema_name (str): the schema name for the Key Kafka field in the CDC events for the topic. This
            schema should include definitions for the columns forming the PRIMARY KEY of the underlying table.
            This schema name will be looked up in a schema server.
        key_schema_version (str): the version for the Key schema to look up in the schema server. None or
            "latest" implies using the latest version when Key is not ignored.
        value_schema_name (str): the schema name for the Value Kafka field in the CDC events for the topic. This
            schema should include definitions for all the columns of the underlying table. This schema name will
            be looked up in a schema server.
        value_schema_version (str): the version for the Value schema to look up in the schema server. None or
            "latest" implies using the latest version.

    Returns:
        a CDCSpec

    Raises:
        DHError
    """
    try:
        return CDCSpec(j_spec=_JCdcTools.cdcLongSpec(topic, key_schema_name, key_schema_version,
                                                     value_schema_name, value_schema_version))
    except Exception as e:
        raise DHError(e, "failed to create a CDC spec in cdc_long_spec.") from e
def to_pandas(table: Table, cols: List[str] = None) -> pandas.DataFrame:
    """Produces a pandas.DataFrame from a table.

    Note that the **entire table** is going to be cloned into memory, so the total number of entries in the table
    should be considered before blindly doing this. For large tables, consider using the Deephaven query language
    to select a subset of the table **before** using this method.

    Args:
        table (Table): the source table
        cols (List[str]): the source column names, default is None, which means include all columns

    Returns:
        a pandas.DataFrame

    Raises:
        DHError
    """
    try:
        if table.is_refreshing:
            table = freeze_table(table)

        col_def_dict = {col.name: col for col in table.columns}
        if not cols:
            cols = list(col_def_dict.keys())
        else:
            diff_set = set(cols) - set(col_def_dict.keys())
            if diff_set:
                raise DHError(message=f"columns - {list(diff_set)} not found")

        data = {}
        for col in cols:
            series = _column_to_series(table, col_def_dict[col])
            data[col] = series

        dtype_set = set([v.dtype for k, v in data.items()])
        if len(dtype_set) == 1:
            return pandas.DataFrame(data=np.stack([v.array for k, v in data.items()], axis=1),
                                    columns=cols,
                                    copy=False)
        else:
            return pandas.DataFrame(data=data, columns=cols, copy=False)
    except DHError:
        raise
    except Exception as e:
        raise DHError(e, "failed to create a Pandas DataFrame from table.") from e
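
# Usage sketch (illustrative, not part of the original module): assumes this
# function is exposed as deephaven.pandas.to_pandas.
#
# from deephaven import empty_table
# from deephaven.pandas import to_pandas
#
# t = empty_table(10).update(["X = i", "Y = i * 2"])
# df = to_pandas(t)                # clones the whole table into memory
# df_x = to_pandas(t, cols=["X"])  # prefer narrowing the table in the query first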
def produce(
    table: Table,
    kafka_config: Dict,
    topic: str,
    key_spec: KeyValueSpec,
    value_spec: KeyValueSpec,
    last_by_key_columns: bool = False,
) -> Callable[[], None]:
    """Produce to Kafka from a Deephaven table.

    Args:
        table (Table): the source table to publish to Kafka
        kafka_config (Dict): configuration for the associated Kafka producer
        topic (str): the topic name
        key_spec (KeyValueSpec): specifies how to map table column(s) to the Key field in produced Kafka
            messages. This should be the result of calling one of the functions simple_spec(), avro_spec() or
            json_spec() in this module, or the constant KeyValueSpec.IGNORE
        value_spec (KeyValueSpec): specifies how to map table column(s) to the Value field in produced Kafka
            messages. This should be the result of calling one of the functions simple_spec(), avro_spec() or
            json_spec() in this module, or the constant KeyValueSpec.IGNORE
        last_by_key_columns (bool): whether to publish only the last record for each unique key. Ignored if
            key_spec is KeyValueSpec.IGNORE. Otherwise, if last_by_key_columns is true, this method will
            internally perform a last_by aggregation on table grouped by the input columns of key_spec and
            publish to Kafka from the result.

    Returns:
        a callback that, when invoked, stops publishing and cleans up subscriptions and resources. Users should
        hold on to this callback to ensure liveness for publishing for as long as publishing is desired, and
        invoke it once publishing is no longer needed

    Raises:
        DHError
    """
    try:
        if key_spec is KeyValueSpec.IGNORE and value_spec is KeyValueSpec.IGNORE:
            raise ValueError(
                "at least one argument for 'key_spec' or 'value_spec' must be different from KeyValueSpec.IGNORE")

        kafka_config = j_properties(kafka_config)
        runnable = _JKafkaTools.produceFromTable(table.j_table, kafka_config, topic, key_spec.j_object,
                                                 value_spec.j_object, last_by_key_columns)

        def cleanup():
            try:
                runnable.run()
            except Exception as ex:
                raise DHError(ex, "failed to stop publishing to Kafka and clean up.") from ex

        return cleanup
    except Exception as e:
        raise DHError(e, "failed to start producing Kafka messages.") from e
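
# Usage sketch (illustrative, not part of the original module): assumes this
# function and its spec helpers live in deephaven.stream.kafka.producer; the
# broker address and topic name are hypothetical.
#
# from deephaven import time_table
# from deephaven.stream.kafka.producer import produce, json_spec, KeyValueSpec
#
# t = time_table("00:00:01").update(["X = i"])
# cancel = produce(t, {"bootstrap.servers": "localhost:9092"}, "demo_topic",
#                  key_spec=KeyValueSpec.IGNORE, value_spec=json_spec(["X"]))
# # ... later, stop publishing and release resources:
# cancel()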
def to_numpy(table: Table, cols: List[str] = None) -> np.ndarray:
    """Produces a numpy array from a table.

    Note that the **entire table** is going to be cloned into memory, so the total number of entries in the table
    should be considered before blindly doing this. For large tables, consider using the Deephaven query language
    to select a subset of the table **before** using this method.

    Args:
        table (Table): the source table
        cols (List[str]): the source column names, default is None, which means include all columns

    Returns:
        a numpy ndarray

    Raises:
        DHError
    """
    try:
        if table.is_refreshing:
            table = freeze_table(table)

        col_def_dict = {col.name: col for col in table.columns}
        if not cols:
            cols = list(col_def_dict.keys())
        else:
            diff_set = set(cols) - set(col_def_dict.keys())
            if diff_set:
                raise DHError(message=f"columns - {list(diff_set)} not found")

        col_defs = [col_def_dict[col] for col in cols]
        if len(set([col_def.data_type for col_def in col_defs])) != 1:
            raise DHError(message="columns must be of the same data type.")

        j_arrays = []
        for col_def in col_defs:
            data_col = table.j_table.getColumn(col_def.name)
            j_arrays.append(data_col.getDirect())

        return _columns_to_2d_numpy_array(col_defs[0], j_arrays)
    except DHError:
        raise
    except Exception as e:
        raise DHError(e, "failed to create a numpy array from the table columns.") from e
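
# Usage sketch (illustrative, not part of the original module): assumes this
# function is exposed as deephaven.numpy.to_numpy; all requested columns must
# share one data type.
#
# from deephaven import empty_table
# from deephaven.numpy import to_numpy
#
# t = empty_table(5).update(["X = i", "Y = i * i"])
# arr = to_numpy(t, cols=["X", "Y"])  # shape (5, 2), both columns are ints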
def table_to_numpy_2d(row_set, col_set, order: MemoryLayout = MemoryLayout.ROW_MAJOR,
                      np_type: Type = np.intc) -> np.ndarray:
    """Converts Deephaven table data to a 2D NumPy array of the appropriate size.

    Args:
        row_set: a RowSequence describing the number of rows in the table
        col_set: ColumnSources describing which columns to copy
        order (MemoryLayout): the desired memory layout of the output array
        np_type: the desired NumPy data type of the output NumPy array

    Returns:
        a np.ndarray

    Raises:
        DHError
    """
    try:
        np_type = _convert_to_numpy_dtype(np_type)

        if np_type == np.byte:
            buffer = _JGatherer.tensorBuffer2DByte(row_set, col_set, order.is_row_major)
        elif np_type == np.short:
            buffer = _JGatherer.tensorBuffer2DShort(row_set, col_set, order.is_row_major)
        elif np_type == np.intc:
            buffer = _JGatherer.tensorBuffer2DInt(row_set, col_set, order.is_row_major)
        elif np_type == np.int_:
            buffer = _JGatherer.tensorBuffer2DLong(row_set, col_set, order.is_row_major)
        elif np_type == np.single:
            buffer = _JGatherer.tensorBuffer2DFloat(row_set, col_set, order.is_row_major)
        elif np_type == np.double:
            buffer = _JGatherer.tensorBuffer2DDouble(row_set, col_set, order.is_row_major)
        else:
            raise ValueError(f"Data type {np_type} is not supported.")

        tensor = np.frombuffer(buffer, dtype=np_type)
        if order.is_row_major:
            tensor.shape = (len(col_set), row_set.intSize())
            return tensor.T
        else:
            tensor.shape = (row_set.intSize(), len(col_set))
            return tensor
    except Exception as e:
        raise DHError(e, f"failed to convert rows: {row_set} and cols: {col_set} to a 2D NumPy array") from e
def get_app_state():
    """Get the current application state object.

    Raises:
        DHError
    """
    try:
        return ApplicationState(j_app_state=_JApplicationContext.get())
    except Exception as e:
        raise DHError(e, "failed to get the application state.") from e
def start(self) -> None:
    """Starts replaying.

    Raises:
        DHError
    """
    try:
        self._j_replayer.start()
    except Exception as e:
        raise DHError(e, "failed to start the replayer.") from e
def close(self) -> None:
    """Closes the writer.

    Raises:
        DHError
    """
    try:
        self._j_table_writer.close()
    except Exception as e:
        raise DHError(e, "failed to close the writer.") from e
def layout_hints(self, front: Union[str, List[str]] = None, back: Union[str, List[str]] = None,
                 freeze: Union[str, List[str]] = None, hide: Union[str, List[str]] = None) -> Table:
    """Sets layout hints on the Table.

    Args:
        front (Union[str, List[str]]): the columns to show at the front
        back (Union[str, List[str]]): the columns to show at the back
        freeze (Union[str, List[str]]): the columns to freeze to the front. These will not be affected by
            horizontal scrolling.
        hide (Union[str, List[str]]): the columns to hide

    Returns:
        a new table with the layout hints set

    Raises:
        DHError
    """
    try:
        _j_layout_hint_builder = _JLayoutHintBuilder.get()

        if front is not None:
            _j_layout_hint_builder.atFront(to_sequence(front))
        if back is not None:
            _j_layout_hint_builder.atEnd(to_sequence(back))
        if freeze is not None:
            _j_layout_hint_builder.freeze(to_sequence(freeze))
        if hide is not None:
            _j_layout_hint_builder.hide(to_sequence(hide))
    except Exception as e:
        raise DHError(e, "failed to create layout hints") from e

    try:
        return Table(j_table=self.j_table.setLayoutHints(_j_layout_hint_builder.build()))
    except Exception as e:
        raise DHError(e, "failed to set layout hints on table") from e
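
# Usage sketch (illustrative, not part of the original module):
#
# from deephaven import empty_table
#
# t = empty_table(3).update(["A = i", "B = i", "C = i", "D = i"])
# hinted = t.layout_hints(front="A", freeze=["B"], hide=["D"])  # B pinned, D hidden in the UI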