Example #1
def read(path: str,
         col_instructions: List[ColumnInstruction] = None,
         is_legacy_parquet: bool = False) -> Table:
    """ Reads in a table from a single parquet, metadata file, or directory with recognized layout.

    Args:
        path (str): the file or directory to examine
        col_instructions (List[ColumnInstruction]): instructions for customizations while reading
        is_legacy_parquet (bool): whether the parquet data is in the legacy format

    Returns:
        a table

    Raises:
        DHError
    """

    try:
        read_instructions = _build_parquet_instructions(
            col_instructions=col_instructions,
            is_legacy_parquet=is_legacy_parquet,
            for_read=True)

        if read_instructions:
            return Table(
                j_table=_JParquetTools.readTable(path, read_instructions))
        else:
            return Table(j_table=_JParquetTools.readTable(path))
    except Exception as e:
        raise DHError(e, "failed to read parquet data.") from e
Example #2
def consume_raw(
        kafka_config: dict,
        cdc_spec: CDCSpec,
        partitions=None,
        table_type: TableType = TableType.stream(),
) -> Table:
    """ Consume the raw events from a Change Data Capture (CDC) Kafka stream to a Deephaven table.

    Args:
        kafka_config (Dict): configuration for the associated Kafka consumer and also the resulting table. Passed
            to the org.apache.kafka.clients.consumer.KafkaConsumer constructor; pass any desired KafkaConsumer-specific
            configuration here. Note this should include the relevant property for a schema server URL where the
            necessary Avro schemas for the key and/or value are stored.
        cdc_spec (CDCSpec): a CDCSpec obtained from calling either the cdc_long_spec or the cdc_short_spec function
        partitions (List[int]): a list of integer partition numbers, default is None indicating all partitions
        table_type (TableType): a TableType enum, default is TableType.stream()

    Returns:
        a Deephaven live table for the raw CDC events

    Raises:
        DHError
    """
    try:
        partitions = j_partitions(partitions)
        kafka_config = j_properties(kafka_config)
        table_type_enum = table_type.value
        return Table(j_table=_JCdcTools.consumeRawToTable(
            kafka_config, cdc_spec.j_object, partitions, table_type_enum))
    except Exception as e:
        raise DHError(e, "failed to consume a raw CDC stream.") from e
Example #3
def time_window(table: Table, ts_col: str, window: int,
                bool_col: str) -> Table:
    """Creates a new table by applying a time window to the source table and adding a new Boolean column.

    The value of the new Boolean column is set to false when the timestamp column value is older than the window
    (measured from now) and true otherwise. If the timestamp column value is null, the Boolean column value will be
    null as well. The result table ticks whenever the source table ticks, and also modifies a row when it passes out
    of the window.

    Args:
        table (Table): the source table
        ts_col (str): the timestamp column name
        window (int): the size of the window in nanoseconds
        bool_col (str): the name of the new Boolean column.

    Returns:
        a new Table

    Raises:
        DHError
    """
    try:
        return Table(j_table=_JWindowCheck.addTimeWindow(
            table.j_table, ts_col, window, bool_col))
    except Exception as e:
        raise DHError(e, "failed to create a time window table.") from e
Example #4
def query_performance(eval_number: int) -> Table:
    """ Takes in a query evaluation number and returns a view for that query's performance data.

    You can obtain query evaluation numbers, which uniquely identify a query and its subqueries, via the performance
    data tables obtained from calling query_performance_log() or query_operation_performance_log().

    The query performance log contains data on how long each query takes to run. Examples of what constitutes one
    individual query, for performance logging purposes, include:

        * A new command in the console (i.e., type something, then press the return key)
        * A sort, filter, or custom column generated by a UI
        * A call from an external application via a client API

    Args:
        eval_number (int): the evaluation number

    Returns:
        a Table of query performance data

    Raises:
        DHError
    """
    try:
        return Table(j_table=_JPerformanceQueries.queryPerformance(eval_number))
    except Exception as e:
        raise DHError(e, "failed to obtain the query performance data.") from e
Example #5
def query_operation_performance(eval_number: int) -> Table:
    """ Takes in a query evaluation number and returns a view for that query's individual operation's performance data.

    You can obtain query evaluation numbers, which uniquely identify a query and its subqueries, via the performance
    data tables obtained from calling query_performance_log() or query_operation_performance_log().

    The query operation performance log contains data on how long each individual operation of a query (where(),
    update(), naturalJoin(), etc., as well as internal functions) takes to execute, and the change in resource
    consumption while each was executing.

    Args:
        eval_number (int): the evaluation number

    Returns:
        a table of query operation performance data

    Raises:
        DHError
    """
    try:
        return Table(j_table=_JPerformanceQueries.queryOperationPerformance(
            eval_number))
    except Exception as e:
        raise DHError(
            e, "failed to obtain the query operation performance data.") from e
Example #6
def server_state() -> Table:
    """ Returns a table of basic memory, update graph processor, and GC stats for the current engine process,
        sampled on a periodic basis.

    Returns:
        a table
    """
    try:
        return Table(j_table=_JPerformanceQueries.serverState())
    except Exception as e:
        raise DHError(e, "failed to produce a table with server state info.") from e
Example #7
def time_table(period: str, start_time: str = None) -> Table:
    """Creates a table that adds a new row on a regular interval.

    Args:
        period (str): time interval between new row additions
        start_time (str): start time for adding new rows

    Returns:
        a Table

    Raises:
        DHError
    """
    try:
        if start_time:
            return Table(j_table=_JTableTools.timeTable(start_time, period))
        else:
            return Table(j_table=_JTableTools.timeTable(period))

    except Exception as e:
        raise DHError(e, "failed to create a time table.") from e
Example #8
def update_performance_log() -> Table:
    """ Returns a table with Deephaven update performance data.

    Returns:
        a Table

    Raises:
        DHError
    """
    try:
        return Table(j_table=_JTableLoggers.updatePerformanceLog())
    except Exception as e:
        raise DHError(e, "failed to obtain the update performance log table.") from e
Example #9
def process_info_log() -> Table:
    """ Returns a static table with process information for the current Deephaven engine process.

    Returns:
        a Table

    Raises:
        DHError
    """
    try:
        return Table(j_table=_JTableLoggers.processInfoLog())
    except Exception as e:
        raise DHError(e, "failed to obtain the process info log table.") from e
Example #10
def process_metrics_log() -> Table:
    """ Returns a table with metrics collected for the current Deephaven engine process.

    Returns:
        a Table

    Raises:
        DHError
    """
    try:
        return Table(j_table=_JTableLoggers.processMetricsLog())
    except Exception as e:
        raise DHError(e, "failed to obtain the process metrics log table.") from e
Example #11
def server_state_log() -> Table:
    """ Returns a table with memory utilization, update graph processor and garbage collection stats
        sampled on a periodic basis.

    Returns:
        a Table

    Raises:
        DHError
    """
    try:
        return Table(j_table=_JTableLoggers.serverStateLog())
    except Exception as e:
        raise DHError(e, "failed to obtain the server state log table.") from e
Example #12
def query_performance_log() -> Table:
    """ Returns a table with Deephaven query performance data. Performance data for individual sub-operations is
    available from calling `query_operation_performance_log`.

    Returns:
        a Table

    Raises:
        DHError
    """
    try:
        return Table(j_table=_JTableLoggers.queryPerformanceLog())
    except Exception as e:
        raise DHError(e, "failed to obtain the query performance log table.") from e
Example #13
def query_operation_performance_log() -> Table:
    """ Returns a table with Deephaven performance data for individual subqueries. Performance data for the entire query
    is available from calling 'query_performance_log'.

    Returns:
        a Table

    Raises:
        DHError
    """
    try:
        return Table(j_table=_JTableLoggers.queryOperationPerformanceLog())
    except Exception as e:
        raise DHError(e, "failed to obtain the query operation performance log table.") from e
Example #14
def new_table(cols: List[InputColumn]) -> Table:
    """Creates an in-memory table from a list of input columns. Each column must have an equal number of elements.

    Args:
        cols (List[InputColumn]): a list of InputColumn

    Returns:
        a Table

    Raises:
        DHError
    """
    try:
        return Table(j_table=_JTableFactory.newTable(*[col.j_column for col in cols]))
    except Exception as e:
        raise DHError(e, "failed to create a new time table.") from e
Example #15
def empty_table(size: int) -> Table:
    """Creates a table with rows but no columns.

    Args:
        size (int): the number of rows

    Returns:
         a Table

    Raises:
        DHError
    """
    try:
        return Table(j_table=_JTableTools.emptyTable(size))
    except Exception as e:
        raise DHError(e, "failed to create an empty table.") from e
Example #16
    def __init__(self, col_defs: Dict[str, DType]):
        """Initializes the writer and creates a new in-memory table.

        Args:
            col_defs (Dict[str, DType]): a dict of column names and types for the new table

        Raises:
            DHError
        """
        col_names = list(col_defs.keys())
        col_dtypes = list(col_defs.values())
        try:
            self._j_table_writer = _JDynamicTableWriter(col_names, [t.qst_type for t in col_dtypes])
            self.table = Table(j_table=self._j_table_writer.getTable())
        except Exception as e:
            raise DHError(e, "failed to create a DynamicTableWriter.") from e
Example #17
def merge(tables: List[Table]) -> Table:
    """Combines two or more tables into one aggregate table. This essentially appends the tables one on top of the
    other. Null tables are ignored.

    Args:
        tables (List[Table]): the source tables

    Returns:
        a Table

    Raises:
        DHError
    """
    try:
        return Table(j_table=_JTableTools.merge([t.j_table for t in tables]))
    except Exception as e:
        raise DHError(e, "merge tables operation failed.") from e
Example #18
def stream_to_append_only(table: Table) -> Table:
    """ Creates an 'append only' table from the stream table.

    Args:
        table (Table): a stream table

    Returns:
        an append-only table

    Raises:
        DHError
    """
    try:
        return Table(
            j_table=_JStreamTableTools.streamToAppendOnlyTable(table.j_table))
    except Exception as e:
        raise DHError(e, "failed to create an append-only table.") from e
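A one-line sketch; `events` stands for a stream table, e.g. one produced by the Kafka consume() function later in this section with the default TableType.stream().
# Keep the full history of a stream table instead of only the rows from the latest update cycle.
history = stream_to_append_only(events)  # `events` is a placeholder stream table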
Example #19
    def add_table(self, table: Table, col: str) -> Table:
        """Registers a table for replaying and returns the associated replay table.

        Args:
            table (Table): the table to be replayed
            col (str): column in the table containing timestamps

        Returns:
            a replay Table

        Raises:
            DHError
        """
        try:
            replay_table = Table(j_table=self._j_replayer.replay(table.j_table, col))
            return replay_table
        except Exception as e:
            raise DHError(e, "failed to add a historical table.") from e
Example #20
def merge_sorted(tables: List[Table], order_by: str) -> Table:
    """Combines two or more tables into one sorted, aggregate table. This essentially stacks the tables one on top
    of the other and sorts the result. Null tables are ignored. mergeSorted is more efficient than using merge
    followed by sort.

    Args:
        tables (List[Table]): the source tables
        order_by (str): the name of the key column

    Returns:
         a Table

    Raises:
        DHError
    """
    try:
        return Table(j_table=_JTableTools.mergeSorted(order_by, *[t.j_table for t in tables]))
    except Exception as e:
        raise DHError(e, "merge sorted operation failed.") from e
Example #21
def query_update_performance(eval_number: int) -> Table:
    """  Takes in a query evaluation number and returns a view for that query's update performance data.

    You can obtain query evaluation numbers, which uniquely identify a query and its subqueries, via the performance
    data tables obtained from calling query_performance_log() or query_operation_performance_log().

    Args:
        eval_number (int): the evaluation number

    Returns:
        a Table of query update performance data

    Raises:
        DHError
    """
    try:
        return Table(j_table=_JPerformanceQueries.queryUpdatePerformance(eval_number))
    except Exception as e:
        raise DHError(e, "failed to obtain the query update performance data.") from e
Example #22
def query_update_performance_map(eval_number: int) -> Dict[str, Table]:
    """ Creates multiple tables with performance data for a given query identified by an evaluation number. The tables
     are returned in a map with the following String keys: 'QueryUpdatePerformance', 'UpdateWorst', 'WorstInterval',
     'UpdateMostRecent', 'UpdateAggregate', 'UpdateSummaryStats'.

    Args:
        eval_number (int): the evaluation number

    Returns:
        a dict

    Raises:
        DHError
    """

    try:
        d = j_map_to_dict(_JPerformanceQueries.queryUpdatePerformanceMap(eval_number))
        for k in d.keys():
            d[k] = Table(j_table=d[k])
        return d
    except Exception as e:
        raise DHError(e, "failed to obtain the query update perf map.") from e
Example #23
def consume(
    kafka_config: Dict,
    cdc_spec: CDCSpec,
    partitions: List[int] = None,
    stream_table: bool = False,
    cols_to_drop: List[str] = None,
) -> Table:
    """ Consume from a Change Data Capture (CDC) Kafka stream (as, eg, produced by Debezium), tracking the underlying
    database table to a Deephaven table.

    Args:
        kafka_config (Dict): configuration for the associated Kafka consumer and also the resulting table. Passed
            to the org.apache.kafka.clients.consumer.KafkaConsumer constructor; pass any desired KafkaConsumer-specific
            configuration here. Note this should include the relevant property for a schema server URL where the
            necessary Avro schemas for the key and/or value are stored.
        cdc_spec (CDCSpec): a CDCSpec obtained from calling either the cdc_long_spec or the cdc_short_spec function
        partitions (List[int]): a list of integer partition numbers, default is None indicating all partitions
        stream_table (bool): if true, produce a streaming table of changed rows keeping the CDC 'op' column
            indicating the type of change; if false, return a Deephaven ticking table that tracks the underlying
            database table through the CDC stream.
        cols_to_drop (List[str]): a list of column names to omit from the resulting DHC table. Note that only columns
            not included in the primary key for the table can be dropped at this stage; you can chain a drop-column
            operation after this call if you need to drop a key column.

    Returns:
        a Deephaven live table that will update based on the CDC messages consumed for the given topic

    Raises:
        DHError
    """
    try:
        partitions = j_partitions(partitions)
        kafka_config = j_properties(kafka_config)
        return Table(j_table=_JCdcTools.consumeToTable(
            kafka_config, cdc_spec.j_object, partitions, stream_table,
            cols_to_drop))
    except Exception as e:
        raise DHError(e, "failed to consume a CDC stream.") from e
Example #24
def learn(table: Table = None,
          model_func: Callable = None,
          inputs: List[Input] = [],
          outputs: List[Output] = [],
          batch_size: int = None) -> Table:
    """ Learn gathers data from multiple rows of the input table, performs a calculation, and scatters values from the
    calculation into an output table. This is a common computing paradigm for artificial intelligence, machine learning,
    and deep learning.

    Args:
        table (Table): the Deephaven table to perform computations on.
        model_func (Callable): function that performs computations on the table.
        inputs (List[Input]): list of Input objects that determine how data gets extracted from the table.
        outputs (List[Output]): list of Output objects that determine how data gets scattered back into the results table.
        batch_size (int): maximum number of rows for which model_func is evaluated at once.

    Returns:
        a Table with added columns containing the results of evaluating model_func.

    Raises:
        DHError
    """

    try:
        _validate(inputs, outputs, table)

        if batch_size is None:
            raise ValueError(
                "Batch size cannot be inferred. Please specify a batch size.")

        # TODO: When ticket #1072 is resolved, the following code should be replaced with
        # Globals["__computer"] = _Computer_(table, model_func, [input.input for input in inputs], batch_size)
        # and remove from globals at the end of function
        (jpy.get_type("io.deephaven.engine.table.lang.QueryScope").addParam(
            "__computer",
            _JLearnComputer(table.j_table, model_func,
                            [input_.input for input_ in inputs], batch_size)))

        future_offset = _create_non_conflicting_col_name(
            table, "__FutureOffset")
        clean = _create_non_conflicting_col_name(table, "__CleanComputer")

        if outputs is not None:
            __scatterer = _JLearnScatterer(
                [output.output for output in outputs])
            # TODO: Similarly at resolution of #1072, replace the following code with
            # Globals["__scatterer"] = __scatterer
            # and remove from Globals at end of function
            jpy.get_type("io.deephaven.engine.table.lang.QueryScope").addParam(
                "__scatterer", __scatterer)

            return (table.update(formulas=[
                f"{future_offset} = __computer.compute(k)",
            ]).update(formulas=[
                __scatterer.generateQueryStrings(f"{future_offset}"),
            ]).update(formulas=[
                f"{clean} = __computer.clear()",
            ]).drop_columns(cols=[
                f"{future_offset}",
                f"{clean}",
            ]))

        result = _create_non_conflicting_col_name(table, "__Result")

        return (table.update(formulas=[
            f"{future_offset} = __computer.compute(k)",
            f"{result} = {future_offset}.getFuture().get()",
            f"{clean} = __computer.clear()",
        ]).drop_columns(cols=[
            f"{future_offset}",
            f"{clean}",
            f"{result}",
        ]))
    except Exception as e:
        raise DHError(e, "failed to complete the learn function.") from e
Example #25
def apply(t: Table) -> Table:
    return t.update("f = a + b")
Example #26
def apply(self, t: Table, ot: Table) -> Table:
    return t.natural_join(ot, on=["a", "b"], joins=["f"])
Example #27
def read(
    path: str,
    header: Dict[str, dht.DType] = None,
    headless: bool = False,
    skip_rows: int = 0,
    num_rows: int = MAX_LONG,
    ignore_empty_lines: bool = False,
    allow_missing_columns: bool = False,
    ignore_excess_columns: bool = False,
    delimiter: str = ",",
    quote: str = '"',
    ignore_surrounding_spaces: bool = True,
    trim: bool = False,
) -> Table:
    """Read the CSV data specified by the path parameter as a table.

    Args:
        path (str): a file path or a URL string
        header (Dict[str, DType]): a dict defining the table columns, with the key being the column name and the
            value being the data type
        headless (bool): indicates if the CSV data is headless, default is False
        skip_rows (int): number of data rows to skip before processing data. This is useful when you want to parse
            data in chunks. Defaults to 0
        num_rows (int): max number of rows to process. This is useful when you want to parse data in chunks.
            Defaults to MAX_LONG
        ignore_empty_lines (bool): whether to ignore empty lines in the input, default is False
        allow_missing_columns (bool): whether to allow missing columns in the input. If this flag is set, then rows
            that are too short (that have fewer columns than the header row) will be interpreted as if the missing
            columns contained the empty string. Defaults to False
        ignore_excess_columns (bool): whether to allow excess columns in the input. If this flag is set, then rows
            that are too long (that have more columns than the header row) will have those excess columns dropped.
            Defaults to False
        delimiter (str): the delimiter used by the CSV, default is the comma
        quote (str): the quote character for the CSV, default is the double quote
        ignore_surrounding_spaces (bool): indicates whether surrounding white space should be ignored for unquoted
            text fields, default is True
        trim (bool): indicates whether to trim white space inside a quoted string, default is False

    Returns:
        a table

    Raises:
        DHError
    """
    try:
        csv_specs_builder = _JCsvTools.builder()

        if header:
            csv_specs_builder.headers(_JArrays.asList(list(header.keys())))
            parser_map = {
                dht.bool_: _JParsers.BOOLEAN,
                dht.byte: _JParsers.BYTE,
                dht.char: _JParsers.CHAR,
                dht.short: _JParsers.SHORT,
                dht.int_: _JParsers.INT,
                dht.long: _JParsers.LONG,
                dht.float_: _JParsers.FLOAT_FAST,
                dht.double: _JParsers.DOUBLE,
                dht.string: _JParsers.STRING,
                dht.DateTime: _JParsers.DATETIME,
            }
            for column_name, column_type in header.items():
                csv_specs_builder.putParserForName(column_name,
                                                   parser_map[column_type])

        csv_specs = (csv_specs_builder.hasHeaderRow(not headless).skipRows(
            skip_rows).numRows(num_rows).ignoreEmptyLines(
                ignore_empty_lines).allowMissingColumns(allow_missing_columns).
                     ignoreExcessColumns(ignore_excess_columns).delimiter(
                         ord(delimiter)).quote(
                             ord(quote)).ignoreSurroundingSpaces(
                                 ignore_surrounding_spaces).trim(trim).build())

        j_table = _JCsvTools.readCsv(path, csv_specs)

        return Table(j_table=j_table)
    except Exception as e:
        raise DHError(e, "read csv failed") from e
Example #28
def partitioned_transform_func(t: Table, ot: Table) -> Table:
    return t.natural_join(ot, on=["a", "b"], joins=["f"])
Example #29
def transform_func(t: Table) -> Table:
    return t.update("f = a + b")
Example #30
def consume(
        kafka_config: Dict,
        topic: str,
        partitions: List[int] = None,
        offsets: Dict[int, int] = None,
        key_spec: KeyValueSpec = None,
        value_spec: KeyValueSpec = None,
        table_type: TableType = TableType.stream(),
) -> Table:
    """Consume from Kafka to a Deephaven table.

    Args:
        kafka_config (Dict): configuration for the associated Kafka consumer and also the resulting table.
            Once the table-specific properties are stripped, the remaining ones are used to call the constructor of
            org.apache.kafka.clients.consumer.KafkaConsumer; pass any desired KafkaConsumer-specific configuration here
        topic (str): the Kafka topic name
        partitions (List[int]): a list of integer partition numbers, default is None which means all partitions
        offsets (Dict[int, int]): a mapping between partition numbers and offset numbers, which can be one of the
            predefined ALL_PARTITIONS_SEEK_TO_BEGINNING, ALL_PARTITIONS_SEEK_TO_END or ALL_PARTITIONS_DONT_SEEK.
            The default is None, which works the same as ALL_PARTITIONS_DONT_SEEK. The offset numbers may be one
            of the predefined SEEK_TO_BEGINNING, SEEK_TO_END, or DONT_SEEK.
        key_spec (KeyValueSpec): specifies how to map the Key field in Kafka messages to Deephaven column(s).
            It can be the result of calling one of the functions simple_spec(), avro_spec() or json_spec() in this
            module, or the predefined KeyValueSpec.IGNORE or KeyValueSpec.FROM_PROPERTIES. The default is None, which
            works the same as KeyValueSpec.FROM_PROPERTIES; in that case, the kafka_config param should include values
            for dictionary keys 'deephaven.key.column.name' and 'deephaven.key.column.type', for the single resulting
            column name and type
        value_spec (KeyValueSpec): specifies how to map the Value field in Kafka messages to Deephaven column(s).
            It can be the result of calling one of the functions simple_spec(), avro_spec() or json_spec() in this
            module, or the predefined KeyValueSpec.IGNORE or KeyValueSpec.FROM_PROPERTIES. The default is None, which
            works the same as KeyValueSpec.FROM_PROPERTIES; in that case, the kafka_config param should include values
            for dictionary keys 'deephaven.value.column.name' and 'deephaven.value.column.type', for the single
            resulting column name and type
        table_type (TableType): a TableType enum, default is TableType.stream()

    Returns:
        a Deephaven live table that will update based on Kafka messages consumed for the given topic

    Raises:
        DHError
    """

    try:
        partitions = j_partitions(partitions)

        if offsets is None or offsets == ALL_PARTITIONS_DONT_SEEK:
            offsets = _ALL_PARTITIONS_DONT_SEEK
        elif offsets == ALL_PARTITIONS_SEEK_TO_BEGINNING:
            offsets = _ALL_PARTITIONS_SEEK_TO_BEGINNING
        elif offsets == ALL_PARTITIONS_SEEK_TO_END:
            offsets = _ALL_PARTITIONS_SEEK_TO_END
        else:
            partitions_array = jpy.array("int", list(offsets.keys()))
            offsets_array = jpy.array("long", list(offsets.values()))
            offsets = _JKafkaTools.partitionToOffsetFromParallelArrays(
                partitions_array, offsets_array)

        key_spec = KeyValueSpec.FROM_PROPERTIES if key_spec is None else key_spec
        value_spec = KeyValueSpec.FROM_PROPERTIES if value_spec is None else value_spec

        if key_spec is KeyValueSpec.IGNORE and value_spec is KeyValueSpec.IGNORE:
            raise ValueError(
                "at least one argument for 'key' or 'value' must be different from KeyValueSpec.IGNORE"
            )

        kafka_config = j_properties(kafka_config)
        return Table(j_table=_JKafkaTools.consumeToTable(
            kafka_config,
            topic,
            partitions,
            offsets,
            key_spec.j_object,
            value_spec.j_object,
            table_type.j_object,
        ))
    except Exception as e:
        raise DHError(e, "failed to consume a Kafka stream.") from e