def read(path: str, col_instructions: List[ColumnInstruction] = None, is_legacy_parquet: bool = False) -> Table:
    """Load a table from a parquet file, a metadata file, or a directory with a recognized layout.

    Args:
        path (str): the file or directory to examine
        col_instructions (List[ColumnInstruction]): per-column customizations applied while reading
        is_legacy_parquet (bool): whether the parquet data is legacy

    Returns:
        a table

    Raises:
        DHError: if building the read instructions or reading the data fails
    """
    try:
        instructions = _build_parquet_instructions(
            col_instructions=col_instructions,
            is_legacy_parquet=is_legacy_parquet,
            for_read=True,
        )
        # Without instructions, use the single-argument overload.
        if not instructions:
            return Table(j_table=_JParquetTools.readTable(path))
        return Table(j_table=_JParquetTools.readTable(path, instructions))
    except Exception as err:
        raise DHError(err, "failed to read parquet data.") from err
def consume_raw(
        kafka_config: dict,
        cdc_spec: CDCSpec,
        partitions=None,
        table_type: TableType = TableType.stream(),
) -> Table:
    """Consume the raw events of a Change Data Capture (CDC) Kafka stream into a Deephaven table.

    Args:
        kafka_config (Dict): configuration for the associated kafka consumer and also the resulting table.
            Passed to the org.apache.kafka.clients.consumer.KafkaConsumer constructor; pass any
            KafkaConsumer specific desired configuration here. Note this should include the relevant
            property for a schema server URL where the key and/or value Avro necessary schemas are stored.
        cdc_spec (CDCSpec): a CDCSpec obtained from calling either the cdc_long_spec or the
            cdc_short_spec function
        partitions (List[int]): a list of integer partition numbers, default is None indicating all
            partitions
        table_type (TableType): a TableType enum, default is TableType.stream()

    Returns:
        a Deephaven live table for the raw CDC events

    Raises:
        DHError: if any step of setting up the consumer fails
    """
    try:
        # Convert the Python arguments to their Java counterparts before the call.
        j_parts = j_partitions(partitions)
        j_config = j_properties(kafka_config)
        j_table_type = table_type.value
        j_raw = _JCdcTools.consumeRawToTable(j_config, cdc_spec.j_object, j_parts, j_table_type)
        return Table(j_table=j_raw)
    except Exception as err:
        raise DHError(err, "failed to consume a raw CDC stream.") from err
def time_window(table: Table, ts_col: str, window: int, bool_col: str) -> Table:
    """Apply a time window to the source table, adding a new Boolean column.

    The new Boolean column is false when the timestamp column value is older than the window
    from now, and true otherwise. If the timestamp column value is null, the Boolean column
    value is null as well. The result table ticks whenever the source table ticks, or modifies
    a row when it passes out of the window.

    Args:
        table (Table): the source table
        ts_col (str): the timestamp column name
        window (int): the size of the window in nanoseconds
        bool_col (str): the name of the new Boolean column

    Returns:
        a new Table

    Raises:
        DHError: if the windowed table cannot be created
    """
    try:
        j_windowed = _JWindowCheck.addTimeWindow(table.j_table, ts_col, window, bool_col)
        return Table(j_table=j_windowed)
    except Exception as err:
        raise DHError(err, "failed to create a time window table.") from err
def query_performance(eval_number: int) -> Table:
    """Return a view of the performance data for the query with the given evaluation number.

    Query evaluation numbers uniquely identify a query and its subqueries; they can be found in
    the performance data tables returned by query_performance_log() or
    query_operation_performance_log().

    The query performance log contains data on how long each query takes to run. Examples of
    what constitutes one individual query, for performance logging purposes, include:

        * A new command in the console (i.e. type something, then press the return key)
        * A sort, filter, or custom column generated by a UI
        * A call from a client API external application

    Args:
        eval_number (int): the evaluation number

    Returns:
        a Table of query performance data

    Raises:
        DHError: if the performance data cannot be obtained
    """
    try:
        j_perf = _JPerformanceQueries.queryPerformance(eval_number)
        return Table(j_table=j_perf)
    except Exception as err:
        raise DHError(err, "failed to obtain the query performance data.") from err
def query_operation_performance(eval_number: int) -> Table:
    """Return a view of the per-operation performance data for the query with the given evaluation number.

    Query evaluation numbers uniquely identify a query and its subqueries; they can be found in
    the performance data tables returned by query_performance_log() or
    query_operation_performance_log().

    The query operation performance log contains data on how long each individual operation of
    a query (where(), update(), naturalJoin(), etc., as well as internal functions) takes to
    execute, and the change in resource consumption while each was executing.

    Args:
        eval_number (int): the evaluation number

    Returns:
        a table of query operation performance data

    Raises:
        DHError: if the performance data cannot be obtained
    """
    try:
        j_op_perf = _JPerformanceQueries.queryOperationPerformance(eval_number)
        return Table(j_table=j_op_perf)
    except Exception as err:
        raise DHError(err, "failed to obtain the query operation performance data.") from err
def server_state() -> Table:
    """Return a table of basic memory, update graph processor, and GC stats for the current
    engine process, sampled on a periodic basis.

    Returns:
        a table

    Raises:
        DHError: if the table cannot be produced
    """
    try:
        j_state = _JPerformanceQueries.serverState()
        return Table(j_table=j_state)
    except Exception as err:
        raise DHError(err, "failed to produce a table with server state info.") from err
def time_table(period: str, start_time: str = None) -> Table:
    """Create a table that adds a new row on a regular interval.

    Args:
        period (str): time interval between new row additions
        start_time (str): start time for adding new rows; when None or empty, the
            period-only overload is used

    Returns:
        a Table

    Raises:
        DHError: if the time table cannot be created
    """
    try:
        # No start time given: call the single-argument overload.
        if not start_time:
            return Table(j_table=_JTableTools.timeTable(period))
        return Table(j_table=_JTableTools.timeTable(start_time, period))
    except Exception as err:
        raise DHError(err, "failed to create a time table.") from err
def update_performance_log() -> Table:
    """Return a table with Deephaven update performance data.

    Returns:
        a Table

    Raises:
        DHError: if the log table cannot be obtained
    """
    try:
        j_log = _JTableLoggers.updatePerformanceLog()
        return Table(j_table=j_log)
    except Exception as err:
        raise DHError(err, "failed to obtain the update performance log table.") from err
def process_info_log() -> Table:
    """Return a static table with process information for the current Deephaven engine process.

    Returns:
        a Table

    Raises:
        DHError: if the log table cannot be obtained
    """
    try:
        j_log = _JTableLoggers.processInfoLog()
        return Table(j_table=j_log)
    except Exception as err:
        raise DHError(err, "failed to obtain the process info log table.") from err
def process_metrics_log() -> Table:
    """Return a table with metrics collected for the current Deephaven engine process.

    Returns:
        a Table

    Raises:
        DHError: if the log table cannot be obtained
    """
    try:
        j_log = _JTableLoggers.processMetricsLog()
        return Table(j_table=j_log)
    except Exception as err:
        raise DHError(err, "failed to obtain the process metrics log table.") from err
def server_state_log() -> Table:
    """Return a table with memory utilization, update graph processor and garbage collection
    stats sampled on a periodic basis.

    Returns:
        a Table

    Raises:
        DHError: if the log table cannot be obtained
    """
    try:
        j_log = _JTableLoggers.serverStateLog()
        return Table(j_table=j_log)
    except Exception as err:
        raise DHError(err, "failed to obtain the server state log table.") from err
def query_performance_log() -> Table:
    """Return a table with Deephaven query performance data.

    Performance data for individual sub-operations is available from calling
    `query_operation_performance_log`.

    Returns:
        a Table

    Raises:
        DHError: if the log table cannot be obtained
    """
    try:
        j_log = _JTableLoggers.queryPerformanceLog()
        return Table(j_table=j_log)
    except Exception as err:
        raise DHError(err, "failed to obtain the query performance log table.") from err
def query_operation_performance_log() -> Table:
    """Return a table with Deephaven performance data for individual subqueries.

    Performance data for the entire query is available from calling 'query_performance_log'.

    Returns:
        a Table

    Raises:
        DHError: if the log table cannot be obtained
    """
    try:
        j_log = _JTableLoggers.queryOperationPerformanceLog()
        return Table(j_table=j_log)
    except Exception as err:
        raise DHError(err, "failed to obtain the query operation performance log table.") from err
def new_table(cols: List[InputColumn]) -> Table:
    """Creates an in-memory table from a list of input columns.

    Each column must have an equal number of elements.

    Args:
        cols (List[InputColumn]): a list of InputColumn

    Returns:
        a Table

    Raises:
        DHError: if the table cannot be created from the given columns
    """
    try:
        # Unpack the Java column objects as varargs for the table factory.
        j_columns = [col.j_column for col in cols]
        return Table(j_table=_JTableFactory.newTable(*j_columns))
    except Exception as e:
        # This function builds a plain in-memory table, not a time table; the
        # previous message ("new time table") was misleading.
        raise DHError(e, "failed to create a new table.") from e
def empty_table(size: int) -> Table:
    """Create a table with rows but no columns.

    Args:
        size (int): the number of rows

    Returns:
        a Table

    Raises:
        DHError: if the table cannot be created
    """
    try:
        j_empty = _JTableTools.emptyTable(size)
        return Table(j_table=j_empty)
    except Exception as err:
        raise DHError(err, "failed to create an empty table.") from err
def __init__(self, col_defs: Dict[str, DType]):
    """Initialize the writer and create a new in-memory table.

    Args:
        col_defs (Dict[str, DType]): a map of column names and types of the new table

    Raises:
        DHError: if the Java writer or its table cannot be created
    """
    # Split the mapping into parallel name / type lists; this happens outside the
    # try block, so a malformed col_defs is not wrapped in DHError.
    names = list(col_defs.keys())
    dtypes = list(col_defs.values())
    try:
        qst_types = [dtype.qst_type for dtype in dtypes]
        self._j_table_writer = _JDynamicTableWriter(names, qst_types)
        self.table = Table(j_table=self._j_table_writer.getTable())
    except Exception as err:
        raise DHError(err, "failed to create a DynamicTableWriter.") from err
def merge(tables: List[Table]) -> Table:
    """Combines two or more tables into one aggregate table.

    This essentially appends the tables one on top of the other. Null tables are ignored.

    Args:
        tables (List[Table]): the source tables; None entries are skipped

    Returns:
        a Table

    Raises:
        DHError: if the merge fails
    """
    try:
        # None has no j_table attribute; filter such entries out here so the
        # documented "null tables are ignored" contract holds instead of raising.
        j_tables = [t.j_table for t in tables if t is not None]
        return Table(j_table=_JTableTools.merge(j_tables))
    except Exception as e:
        raise DHError(e, "merge tables operation failed.") from e
def stream_to_append_only(table: Table) -> Table:
    """Create an 'append only' table from the stream table.

    Args:
        table (Table): a stream table

    Returns:
        an append-only table

    Raises:
        DHError: if the append-only table cannot be created
    """
    try:
        j_append_only = _JStreamTableTools.streamToAppendOnlyTable(table.j_table)
        return Table(j_table=j_append_only)
    except Exception as err:
        raise DHError(err, "failed to create an append-only table.") from err
def add_table(self, table: Table, col: str) -> Table:
    """Register a table for replaying and return the associated replay table.

    Args:
        table (Table): the table to be replayed
        col (str): column in the table containing timestamps

    Returns:
        a replay Table

    Raises:
        DHError: if the table cannot be registered with the replayer
    """
    try:
        return Table(j_table=self._j_replayer.replay(table.j_table, col))
    except Exception as err:
        raise DHError(err, "failed to add a historical table.") from err
def merge_sorted(tables: List[Table], order_by: str) -> Table:
    """Combines two or more tables into one sorted, aggregate table.

    This essentially stacks the tables one on top of the other and sorts the result. Null
    tables are ignored. mergeSorted is more efficient than using merge followed by sort.

    Args:
        tables (List[Table]): the source tables; None entries are skipped
        order_by (str): the name of the key column

    Returns:
        a Table

    Raises:
        DHError: if the sorted merge fails
    """
    try:
        # None has no j_table attribute; filter such entries out here so the
        # documented "null tables are ignored" contract holds instead of raising.
        j_tables = [t.j_table for t in tables if t is not None]
        return Table(j_table=_JTableTools.mergeSorted(order_by, *j_tables))
    except Exception as e:
        raise DHError(e, "merge sorted operation failed.") from e
def query_update_performance(eval_number: int) -> Table:
    """Return a view of the update performance data for the query with the given evaluation number.

    Query evaluation numbers uniquely identify a query and its subqueries; they can be found in
    the performance data tables returned by query_performance_log() or
    query_operation_performance_log().

    Args:
        eval_number (int): the evaluation number

    Returns:
        a Table of query update performance data

    Raises:
        DHError: if the performance data cannot be obtained
    """
    try:
        j_update_perf = _JPerformanceQueries.queryUpdatePerformance(eval_number)
        return Table(j_table=j_update_perf)
    except Exception as err:
        raise DHError(err, "failed to obtain the query update performance data.") from err
def query_update_performance_map(eval_number: int) -> Dict[str, Table]:
    """Create multiple tables with performance data for the query identified by an evaluation number.

    The tables are returned in a map with the following String keys: 'QueryUpdatePerformance',
    'UpdateWorst', 'WorstInterval', 'UpdateMostRecent', 'UpdateAggregate', 'UpdateSummaryStats'.

    Args:
        eval_number (int): the evaluation number

    Returns:
        a dict

    Raises:
        DHError: if the performance map cannot be obtained
    """
    try:
        j_tables_by_name = j_map_to_dict(_JPerformanceQueries.queryUpdatePerformanceMap(eval_number))
        # Wrap every Java table in a Python Table, keeping the original keys and order.
        return {name: Table(j_table=j_tbl) for name, j_tbl in j_tables_by_name.items()}
    except Exception as err:
        raise DHError(err, "failed to obtain the query update perf map.") from err
def consume(
        kafka_config: Dict,
        cdc_spec: CDCSpec,
        partitions: List[int] = None,
        stream_table: bool = False,
        cols_to_drop: List[str] = None,
) -> Table:
    """Consume from a Change Data Capture (CDC) Kafka stream (as, eg, produced by Debezium),
    tracking the underlying database table to a Deephaven table.

    Args:
        kafka_config (Dict): configuration for the associated kafka consumer and also the resulting table.
            Passed to the org.apache.kafka.clients.consumer.KafkaConsumer constructor; pass any
            KafkaConsumer specific desired configuration here. Note this should include the relevant
            property for a schema server URL where the key and/or value Avro necessary schemas are stored.
        cdc_spec (CDCSpec): a CDCSpec obtained from calling either the cdc_long_spec or the
            cdc_short_spec function
        partitions (List[int]): a list of integer partition numbers, default is None indicating all
            partitions
        stream_table (bool): if true, produce a streaming table of changed rows keeping the CDC 'op'
            column indicating the type of column change; if false, return a Deephaven ticking table
            that tracks the underlying database table through the CDC Stream.
        cols_to_drop (list[str]): a list of column names to omit from the resulting DHC table. Note
            that only columns not included in the primary key for the table can be dropped at this
            stage; you can chain a drop column operation after this call if you need to do this.

    Returns:
        a Deephaven live table that will update based on the CDC messages consumed for the given topic

    Raises:
        DHError: if any step of setting up the consumer fails
    """
    try:
        # Convert the Python arguments to their Java counterparts before the call.
        j_parts = j_partitions(partitions)
        j_config = j_properties(kafka_config)
        j_tracked = _JCdcTools.consumeToTable(j_config, cdc_spec.j_object, j_parts, stream_table, cols_to_drop)
        return Table(j_table=j_tracked)
    except Exception as err:
        raise DHError(err, "failed to consume a CDC stream.") from err
def learn(table: Table = None, model_func: Callable = None, inputs: List[Input] = [], outputs: List[Output] = [],
          batch_size: int = None) -> Table:
    """ Learn gathers data from multiple rows of the input table, performs a calculation, and scatters values from the
    calculation into an output table. This is a common computing paradigm for artificial intelligence, machine learning,
    and deep learning.

    Args:
        table (Table): the Deephaven table to perform computations on.
        model_func (Callable): function that performs computations on the table.
        inputs (List[Input]): list of Input objects that determine how data gets extracted from the table.
        outputs (List[Output]): list of Output objects that determine how data gets scattered back into the results
            table. When explicitly None, no values are scattered back and only the temporary helper columns are
            computed and then dropped.
        batch_size (int): maximum number of rows for which model_func is evaluated at once. Must be provided;
            None results in an error.

    Returns:
        a Table with added columns containing the results of evaluating model_func.

    Raises:
        DHError: if validation fails, batch_size is None, or any table operation fails
    """
    # NOTE(review): `inputs` and `outputs` use mutable list defaults. Nothing visible here mutates
    # them, but _validate is defined elsewhere - confirm it does not modify its arguments.

    try:
        _validate(inputs, outputs, table)

        # A batch size is mandatory; there is no fallback inference in this function.
        if batch_size is None:
            raise ValueError(
                "Batch size cannot be inferred. \nPlease specify a batch size.")

        # The computer is registered in the query scope under the name "__computer" so that the
        # formula strings below can reference it.
        # TODO: When ticket #1072 is resolved, the following code should be replaced with
        # Globals["__computer"] = _Computer_(table, model_func, [input.input for input in inputs], batch_size)
        # and remove from globals at the end of function
        (jpy.get_type("io.deephaven.engine.table.lang.QueryScope").addParam(
            "__computer",
            _JLearnComputer(table.j_table, model_func,
                            [input_.input for input_ in inputs], batch_size)))

        # Temporary column names chosen so they do not clash with columns already in the table.
        future_offset = _create_non_conflicting_col_name(
            table, "__FutureOffset")
        clean = _create_non_conflicting_col_name(table, "__CleanComputer")

        if outputs is not None:
            __scatterer = _JLearnScatterer(
                [output.output for output in outputs])
            # The scatterer must be in the query scope before its generated query strings are used.
            # TODO: Similarly at resolution of #1072, replace the following code with
            # Globals["__scatterer"] = __scatterer
            # and remove from Globals at end of function
            jpy.get_type("io.deephaven.engine.table.lang.QueryScope").addParam(
                "__scatterer", __scatterer)

            # Three separate update steps: compute, scatter, then clear the computer; the two
            # temporary columns are dropped from the result.
            return (table.update(formulas=[
                f"{future_offset} = __computer.compute(k)",
            ]).update(formulas=[
                __scatterer.generateQueryStrings(f"{future_offset}"),
            ]).update(formulas=[
                f"{clean} = __computer.clear()",
            ]).drop_columns(cols=[
                f"{future_offset}",
                f"{clean}",
            ]))

        # outputs is None: evaluate the model and wait for the result, but keep none of the
        # temporary columns in the returned table.
        result = _create_non_conflicting_col_name(table, "__Result")

        return (table.update(formulas=[
            f"{future_offset} = __computer.compute(k)",
            f"{result} = {future_offset}.getFuture().get()",
            f"{clean} = __computer.clear()",
        ]).drop_columns(cols=[
            f"{future_offset}",
            f"{clean}",
            f"{result}",
        ]))
    except Exception as e:
        raise DHError(e, "failed to complete the learn function.") from e
def apply(t: Table) -> Table:
    """Return ``t`` updated with a column ``f`` defined by the formula ``f = a + b``."""
    with_sum = t.update("f = a + b")
    return with_sum
def apply(self, t: Table, ot: Table) -> Table:
    """Natural-join ``ot`` onto ``t`` on columns ``a`` and ``b``, bringing in column ``f``."""
    key_cols = ["a", "b"]
    joined_cols = ["f"]
    return t.natural_join(ot, on=key_cols, joins=joined_cols)
def read(
        path: str,
        header: Dict[str, dht.DType] = None,
        headless: bool = False,
        skip_rows: int = 0,
        num_rows: int = MAX_LONG,
        ignore_empty_lines: bool = False,
        allow_missing_columns: bool = False,
        ignore_excess_columns: bool = False,
        delimiter: str = ",",
        quote: str = '"',
        ignore_surrounding_spaces: bool = True,
        trim: bool = False,
) -> Table:
    """Read the CSV data specified by the path parameter as a table.

    Args:
        path (str): a file path or a URL string
        header (Dict[str, DType]): a dict to define the table columns with key being the name,
            value being the data type
        headless (bool): indicates if the CSV data is headless, default is False
        skip_rows (long): number of data rows to skip before processing data. This is useful when
            you want to parse data in chunks. Defaults to 0
        num_rows (long): max number of rows to process. This is useful when you want to parse data
            in chunks. Defaults to MAX_LONG
        ignore_empty_lines (bool): value passed to the CSV specs' ignoreEmptyLines option,
            default is False
        allow_missing_columns (bool): whether the library should allow missing columns in the
            input. If this flag is set, then rows that are too short (that have fewer columns than
            the header row) will be interpreted as if the missing columns contained the empty
            string. Defaults to false.
        ignore_excess_columns (bool): whether the library should allow excess columns in the
            input. If this flag is set, then rows that are too long (that have more columns than
            the header row) will have those excess columns dropped. Defaults to false.
        delimiter (str): the delimiter used by the CSV, default is the comma
        quote (str): the quote character for the CSV, default is double quote
        ignore_surrounding_spaces (bool): indicates whether surrounding white space should be
            ignored for unquoted text fields, default is True
        trim (bool): indicates whether to trim white space inside a quoted string, default is False

    Returns:
        a table

    Raises:
        DHError: if building the CSV specs or reading the data fails (including a header
            type that has no parser mapping)
    """
    try:
        builder = _JCsvTools.builder()

        if header:
            builder.headers(_JArrays.asList(list(header.keys())))
            # Supported column types and the parser used for each.
            parsers_by_dtype = {
                dht.bool_: _JParsers.BOOLEAN,
                dht.byte: _JParsers.BYTE,
                dht.char: _JParsers.CHAR,
                dht.short: _JParsers.SHORT,
                dht.int_: _JParsers.INT,
                dht.long: _JParsers.LONG,
                dht.float_: _JParsers.FLOAT_FAST,
                dht.double: _JParsers.DOUBLE,
                dht.string: _JParsers.STRING,
                dht.DateTime: _JParsers.DATETIME,
            }
            for name, dtype in header.items():
                builder.putParserForName(name, parsers_by_dtype[dtype])

        # Apply the remaining options one at a time, always continuing from the
        # builder returned by the previous call (equivalent to a fluent chain).
        chained = builder.hasHeaderRow(not headless)
        chained = chained.skipRows(skip_rows)
        chained = chained.numRows(num_rows)
        chained = chained.ignoreEmptyLines(ignore_empty_lines)
        chained = chained.allowMissingColumns(allow_missing_columns)
        chained = chained.ignoreExcessColumns(ignore_excess_columns)
        # The Java API is given the character code points of delimiter and quote.
        chained = chained.delimiter(ord(delimiter))
        chained = chained.quote(ord(quote))
        chained = chained.ignoreSurroundingSpaces(ignore_surrounding_spaces)
        chained = chained.trim(trim)
        specs = chained.build()

        return Table(j_table=_JCsvTools.readCsv(path, specs))
    except Exception as err:
        raise DHError(err, "read csv failed") from err
def partitioned_transform_func(t: Table, ot: Table) -> Table:
    """Natural-join ``ot`` onto ``t`` on columns ``a`` and ``b``, bringing in column ``f``."""
    key_cols = ["a", "b"]
    joined_cols = ["f"]
    return t.natural_join(ot, on=key_cols, joins=joined_cols)
def transform_func(t: Table) -> Table:
    """Return ``t`` updated with a column ``f`` defined by the formula ``f = a + b``."""
    with_sum = t.update("f = a + b")
    return with_sum
def consume(
        kafka_config: Dict,
        topic: str,
        partitions: List[int] = None,
        offsets: Dict[int, int] = None,
        key_spec: KeyValueSpec = None,
        value_spec: KeyValueSpec = None,
        table_type: TableType = TableType.stream(),
) -> Table:
    """Consume from Kafka to a Deephaven table.

    Args:
        kafka_config (Dict): configuration for the associated Kafka consumer and also the resulting
            table. Once the table-specific properties are stripped, the remaining one is used to
            call the constructor of org.apache.kafka.clients.consumer.KafkaConsumer; pass any
            KafkaConsumer specific desired configuration here
        topic (str): the Kafka topic name
        partitions (List[int]): a list of integer partition numbers, default is None which means
            all partitions
        offsets (Dict[int, int]): a mapping between partition numbers and offset numbers, and can
            be one of the predefined ALL_PARTITIONS_SEEK_TO_BEGINNING, ALL_PARTITIONS_SEEK_TO_END
            or ALL_PARTITIONS_DONT_SEEK. The default is None which works the same as
            ALL_PARTITIONS_DONT_SEEK. The offset numbers may be one of the predefined
            SEEK_TO_BEGINNING, SEEK_TO_END, or DONT_SEEK.
        key_spec (KeyValueSpec): specifies how to map the Key field in Kafka messages to Deephaven
            column(s). It can be the result of calling one of the functions: simple_spec(),
            avro_spec() or json_spec() in this module, or the predefined KeyValueSpec.IGNORE or
            KeyValueSpec.FROM_PROPERTIES. The default is None which works the same as
            KeyValueSpec.FROM_PROPERTIES, in which case, the kafka_config param should include
            values for dictionary keys 'deephaven.key.column.name' and 'deephaven.key.column.type',
            for the single resulting column name and type
        value_spec (KeyValueSpec): specifies how to map the Value field in Kafka messages to
            Deephaven column(s). It can be the result of calling one of the functions:
            simple_spec(), avro_spec() or json_spec() in this module, or the predefined
            KeyValueSpec.IGNORE or KeyValueSpec.FROM_PROPERTIES. The default is None which works
            the same as KeyValueSpec.FROM_PROPERTIES, in which case, the kafka_config param should
            include values for dictionary keys 'deephaven.value.column.name' and
            'deephaven.value.column.type', for the single resulting column name and type
        table_type (TableType): a TableType enum, default is TableType.stream()

    Returns:
        a Deephaven live table that will update based on Kafka messages consumed for the given topic

    Raises:
        DHError: if argument conversion fails, both specs are KeyValueSpec.IGNORE, or the
            consumer cannot be created
    """
    try:
        j_parts = j_partitions(partitions)

        # Map the public offset constants to their Java equivalents; any other value is
        # treated as an explicit {partition: offset} mapping.
        if offsets is None or offsets == ALL_PARTITIONS_DONT_SEEK:
            j_offsets = _ALL_PARTITIONS_DONT_SEEK
        elif offsets == ALL_PARTITIONS_SEEK_TO_BEGINNING:
            j_offsets = _ALL_PARTITIONS_SEEK_TO_BEGINNING
        elif offsets == ALL_PARTITIONS_SEEK_TO_END:
            j_offsets = _ALL_PARTITIONS_SEEK_TO_END
        else:
            partition_keys = jpy.array("int", list(offsets.keys()))
            offset_values = jpy.array("long", list(offsets.values()))
            j_offsets = _JKafkaTools.partitionToOffsetFromParallelArrays(partition_keys, offset_values)

        # Unspecified specs fall back to reading the column definition from the properties.
        if key_spec is None:
            key_spec = KeyValueSpec.FROM_PROPERTIES
        if value_spec is None:
            value_spec = KeyValueSpec.FROM_PROPERTIES

        if key_spec is KeyValueSpec.IGNORE and value_spec is KeyValueSpec.IGNORE:
            raise ValueError(
                "at least one argument for 'key' or 'value' must be different from KeyValueSpec.IGNORE"
            )

        j_config = j_properties(kafka_config)
        j_consumed = _JKafkaTools.consumeToTable(
            j_config,
            topic,
            j_parts,
            j_offsets,
            key_spec.j_object,
            value_spec.j_object,
            table_type.j_object,
        )
        return Table(j_table=j_consumed)
    except Exception as err:
        raise DHError(err, "failed to consume a Kafka stream.") from err