Example #1
def consumeToTable(
    kafka_config: dict,
    cdc_spec,
    partitions=None,
    as_stream_table=False,
    drop_columns=None,
):
    """
    Consume from a Change Data Capture (CDC) Kafka stream (as produced by, e.g., Debezium),
    tracking the underlying database table as a Deephaven table.

    :param kafka_config: Dictionary with properties to configure the associated Kafka consumer and
         also the resulting table.  Passed to the org.apache.kafka.clients.consumer.KafkaConsumer constructor;
         pass any KafkaConsumer-specific desired configuration here.
         Note this should include the relevant property for a schema server URL where the
         necessary Avro schemas for the key and/or value are stored.
    :param cdc_spec:  A CDC Spec opaque object obtained from calling either the cdc_explict_spec method
                      or the cdc_short_spec method
    :param partitions: Either a sequence of integer partition numbers or the predefined constant
         ALL_PARTITIONS for all partitions.  Defaults to ALL_PARTITIONS if unspecified.
    :param as_stream_table:  If true, produce a streaming table of changed rows, keeping
         the CDC 'op' column that indicates the type of change; if false, return
         a DHC ticking table that tracks the underlying database table through the CDC stream.
    :param drop_columns: A sequence of column names to omit from the resulting DHC table.
         Note that only columns not included in the table's primary key can be dropped at this stage;
         chain a drop-columns operation after this call if you need to drop a primary-key column.
    :return: A Deephaven live table that will update based on the CDC messages consumed for the given topic.
    :raises: ValueError or TypeError if arguments provided can't be processed.
    """

    partitions = ck._jpy_partitions(partitions)
    kafka_config = _dictToProperties(kafka_config)
    return _java_type_.consumeToTable(kafka_config, cdc_spec, partitions,
                                      as_stream_table, drop_columns)
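
A minimal usage sketch of this CDC consumer follows. The broker and schema-registry addresses and the Debezium server/schema/table names are placeholders, and the exact signature of cdc_short_spec (referenced in the docstring above) is an assumption made for illustration.

# Hypothetical example: track a Debezium-captured 'shipping.users' table.
kafka_config = {
    'bootstrap.servers': 'localhost:9092',           # placeholder broker address
    'schema.registry.url': 'http://localhost:8081',  # where the Avro schemas are fetched from
}
# Argument order (server, schema, table) for cdc_short_spec is assumed here.
spec = cdc_short_spec('dbserver1', 'shipping', 'users')
users = consumeToTable(kafka_config, spec, drop_columns=['internal_notes'])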
Example #2
def consumeRawToTable(kafka_config: dict,
                      cdc_spec,
                      partitions=None,
                      table_type: str = 'stream'):
    """
    Consume the raw events from a Change Data Capture (CDC) Kafka stream to a Deephaven table.

    :param kafka_config: Dictionary with properties to configure the associated Kafka consumer and
        also the resulting table.  Passed to the org.apache.kafka.clients.consumer.KafkaConsumer constructor;
        pass any KafkaConsumer-specific desired configuration here.
        Note this should include the relevant property for a schema server URL where the
        necessary Avro schemas for the key and/or value are stored.
    :param cdc_spec:     A CDC Spec opaque object obtained from calling either the cdc_explict_spec method
                         or the cdc_short_spec method
    :param partitions:   Either a sequence of integer partition numbers or the predefined constant
        ALL_PARTITIONS for all partitions.  Defaults to ALL_PARTITIONS if unspecified.
    :param table_type:   A string specifying the resulting table type: one of 'stream' (default), 'append',
       'stream_map' or 'append_map'.
    :return: A Deephaven live table for the raw CDC events.
    """
    partitions = ck._jpy_partitions(partitions)
    kafka_config = _dictToProperties(kafka_config)
    table_type_enum = ck._jpy_table_type(table_type)
    return _java_type_.consumeRawToTable(kafka_config, cdc_spec, partitions,
                                         table_type_enum)
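
A comparable sketch for the raw-event variant, with the same placeholder addresses and the same assumed cdc_short_spec signature; here every raw CDC record is kept in an append-only table.

# Hypothetical example: keep all raw CDC events, e.g. for auditing.
kafka_config = {
    'bootstrap.servers': 'localhost:9092',           # placeholder broker address
    'schema.registry.url': 'http://localhost:8081',  # where the Avro schemas are fetched from
}
spec = cdc_short_spec('dbserver1', 'shipping', 'orders')  # assumed signature
raw_events = consumeRawToTable(kafka_config, spec, table_type='append')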
Example #3
def produceFromTable(table,
                     kafka_config: dict,
                     topic: str,
                     key,
                     value,
                     last_by_key_columns: bool = False):
    """
    Produce a Kafka stream from a Deephaven table.

    Note that ``table`` must only change in ways that are meaningful when turned into a stream of events over Kafka.

    Two primary use cases are considered:

    **A stream of changes (puts and removes) to a key-value data set**
      In order to handle this efficiently and allow for correct reconstruction of the state at a consumer, it is assumed
      that the input data is the result of a Deephaven aggregation, e.g. agg_all_by, agg_by, or last_by. This means
      that key columns (as specified by ``key``) must not be modified, and no rows should be shifted if there
      are any key columns. Note that specifying ``last_by_key_columns`` as ``true`` can make it easy to satisfy this
      constraint if the input data is not already aggregated.

    **A stream of independent log records**
      In this case, the input table should either be a stream table or should only ever add rows.

    If other use cases are identified, a publication mode or extensible listener framework may be introduced at a later
    date.

    :param table: a Deephaven table used as a source of rows to publish to Kafka.
    :param kafka_config: Dictionary with properties to configure the associated kafka producer.
    :param topic: The topic name
    :param key: A specification for how to map table column(s) to the Key field in produced
           Kafka messages.  This should be the result of calling one of the methods simple, avro or json in this module,
           or the constant IGNORE. The resulting key serializer must map each input tuple to a unique output key.
    :param value: A specification for how to map table column(s) to the Value field in produced
           Kafka messages.  This should be the result of calling one of the methods
           simple, avro or json in this module, or the constant IGNORE.
    :param last_by_key_columns:  Whether to publish only the last record for each unique key.
           Ignored if key is IGNORE.  Otherwise, if last_by_key_columns is true this method will internally perform a
           last_by aggregation on table grouped by the input columns of key and publish to Kafka from the result.
    :return: A callback that, when invoked, stops publishing and cleans up
             subscriptions and resources.
             Users should hold on to this callback for as long as publishing is desired,
             to ensure liveness, and invoke it once publishing is no longer needed.
    :raises: ValueError or TypeError if arguments provided can't be processed.
    """

    if not _isStr(topic):
        raise ValueError(
            "argument 'topic' has to be of str type, instead got " + str(topic))

    if key is None:
        raise ValueError("argument 'key' is None")
    if value is None:
        raise ValueError("argument 'value' is None")
    if key is IGNORE and value is IGNORE:
        raise ValueError(
            "at least one argument for 'key' or 'value' must be different from IGNORE"
        )

    kafka_config = _dictToProperties(kafka_config)
    runnable = _java_type_.produceFromTable(table, kafka_config, topic, key,
                                            value, last_by_key_columns)

    def cleanup():
        runnable.run()

    return cleanup
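
A minimal publishing sketch follows. The source table, broker address, topic, and column names are placeholders, and simple(column_name) is assumed to be the single-column spec helper mentioned in the docstring; its exact signature is not shown in this listing.

# Hypothetical example: publish the latest price per symbol to Kafka.
kafka_config = {'bootstrap.servers': 'localhost:9092'}   # placeholder broker address
cancel_publish = produceFromTable(
    quotes,                      # an existing Deephaven table with 'Sym' and 'Price' columns (assumed)
    kafka_config,
    'quotes_topic',
    key=simple('Sym'),           # map the 'Sym' column to the message key (assumed signature)
    value=simple('Price'),       # map the 'Price' column to the message value
    last_by_key_columns=True,    # publish only the last row per key
)
# ... later, once publishing is no longer needed, release resources:
cancel_publish()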
Example #4
def avro(schema,
         schema_version: str = None,
         field_to_col_mapping=None,
         timestamp_field: str = None,
         include_only_columns=None,
         exclude_columns=None,
         publish_schema: bool = False,
         schema_namespace: str = None,
         column_properties=None):
    """
    Specify an Avro schema to use when producing a Kafka stream from a Deephaven table.

    :param schema:  Either an Avro schema object or a string specifying a schema name for a schema
       registered in a Confluent compatible Schema Registry Server.  When the latter is provided, the
       associated kafka_config dict in the call to produceFromTable should include the key
       'schema.registry.url' with the associated value of the Schema Registry Server URL for fetching the schema
       definition.
    :param schema_version:  If a string schema name is provided, the version to fetch from the schema
       service; if not specified, a default of 'latest' is assumed.
    :param field_to_col_mapping: A dict mapping field names in the schema to column names in the Deephaven table.
       Any fields in the schema not present in the dict as keys are mapped to columns of the same name (except for any columns
       ignored via exclude_columns).
       If this argument is None, all schema fields are mapped to columns of the same name (except for any columns
       ignored via exclude_columns).
    :param timestamp_field: a string for the name of an additional timestamp field to include,
                            or None for no such field.
    :param include_only_columns: If not None, a sequence of column names in the source table to include
           in the generated output.  Only one of include_only_columns and exclude_columns can be different from None.
           Defaults to None.
    :param exclude_columns: If not None, a sequence of column names to exclude from the generated output (every other column
           will be included).  Only one of include_only_columns and exclude_columns can be different from None.
           Defaults to None.
    :param publish_schema: If True, publish the given schema name to the Schema Registry Server, according to an Avro schema
           generated from the table definition, for the columns and fields implied by field_to_col_mapping, include_only_columns,
           and exclude_columns.  When true, if a schema_version is provided and the resulting version after publishing does not match,
           an exception results.
    :param schema_namespace: When publish_schema is True, the namespace for the generated schema to be registered in the Schema Registry Server.
    :param column_properties: When publish_schema is True, a dict of string properties for columns that imply particular
            Avro type mappings for them.  In particular, a column X of BigDecimal type should specify the string properties
            'x.precision' and 'x.scale'.
    :return:  A Kafka Key or Value spec object to use in a call to produceFromTable.
    :raises:  ValueError, TypeError or Exception if arguments provided can't be processed.
    """
    if _isStr(schema):
        have_actual_schema = False
        if schema_version is None:
            schema_version = "latest"
        elif not _isStr(schema_version):
            raise TypeError(
                "argument 'schema_version' should be of str type, instead got "
                + str(schema_version) + " of type " +
                type(schema_version).__name__)
    elif isinstance(schema, _avro_schema_jtype_):
        have_actual_schema = True
        if schema_version is not None:
            raise Exception(
                "argument 'schema_version' is only expected if schema is of str type"
            )
    else:
        raise TypeError("'schema' argument expected to be of either " +
                        "str type or avro schema type, instead got " +
                        str(schema))

    if field_to_col_mapping is not None and not isinstance(
            field_to_col_mapping, dict):
        raise TypeError(
            "argument 'field_to_col_mapping' is expected to be of dict type, "
            + "instead got " + str(field_to_col_mapping) + " of type " +
            type(field_to_col_mapping).__name__)
    if column_properties is not None and not isinstance(
            column_properties, dict):
        raise TypeError(
            "argument 'column_properties' is expected to be of dict type, " +
            "instead got " + str(column_properties) + " of type " +
            type(column_properties).__name__)
    field_to_col_mapping = _dictToMap(field_to_col_mapping)
    column_properties = _dictToProperties(column_properties)
    include_only_columns = _seqToSet(include_only_columns)
    include_only_columns = _java_type_.predicateFromSet(include_only_columns)
    exclude_columns = _seqToSet(exclude_columns)
    exclude_columns = _java_type_.predicateFromSet(exclude_columns)
    publish_schema = bool(publish_schema)
    if have_actual_schema:
        return _produce_jtype_.avroSpec(schema, field_to_col_mapping,
                                        timestamp_field, include_only_columns,
                                        exclude_columns, publish_schema,
                                        schema_namespace, column_properties)
    return _produce_jtype_.avroSpec(schema, schema_version,
                                    field_to_col_mapping, timestamp_field,
                                    include_only_columns, exclude_columns,
                                    publish_schema, schema_namespace,
                                    column_properties)
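
A sketch of building a value spec with this function, using only the keyword arguments shown above; the schema name, namespace, and column names are placeholders.

# Hypothetical example: publish a generated Avro schema and map one field explicitly.
value_spec = avro(
    'orders_value',                             # schema name to publish to the Schema Registry Server
    publish_schema=True,
    schema_namespace='io.example.orders',       # placeholder namespace
    field_to_col_mapping={'price': 'Price'},    # schema field 'price' comes from column 'Price'
    exclude_columns=['InternalId'],             # keep this column out of the generated schema
    column_properties={'Total.precision': '12', 'Total.scale': '2'},  # BigDecimal column hints
)
# The returned spec is then passed as the 'value' argument of produceFromTable, with
# 'schema.registry.url' present in that call's kafka_config.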
Example #5
def consumeToTable(
        kafka_config: dict,
        topic: str,
        partitions=None,
        offsets=None,
        key=None,
        value=None,
        table_type='stream'
):
    """
    Consume from Kafka to a Deephaven table.

    :param kafka_config: Dictionary with properties to configure the associated kafka consumer and
        also the resulting table.  Once the table-specific properties are stripped, the result is
        passed to the org.apache.kafka.clients.consumer.KafkaConsumer constructor; pass any
        KafkaConsumer specific desired configuration here.
    :param topic: The topic name
    :param partitions: Either a sequence of integer partition numbers or the predefined constant
       ALL_PARTITIONS for all partitions.
    :param offsets: Either a dict mapping partition numbers to offset numbers, or one of the predefined constants
       ALL_PARTITIONS_SEEK_TO_BEGINNING, ALL_PARTITIONS_SEEK_TO_END or ALL_PARTITIONS_DONT_SEEK.
       If a dict, the values may be one of the predefined constants SEEK_TO_BEGINNING, SEEK_TO_END
       or DONT_SEEK.
    :param key: A specification for how to map the Key field in Kafka messages.  This should be
       the result of calling one of the methods simple, avro or json in this module,
       or None to obtain a single column specified in the kafka_config param via the 
       keys 'deephaven.key.column.name' for column name and 'deephaven.key.column.type' for
       the column type; both should have string values associated to them.
    :param value: A specification for how to map the Value field in Kafka messages.  This should be
       the result of calling one of the methods simple, avro or json in this module,
       or None to obtain a single column specified in the kafka_config param via the 
       keys 'deephaven.value.column.name' for column name and 'deephaven.value.column.type' for
       the column type; both should have string values associated to them.
    :param table_type: A string specifying the resulting table type: one of 'stream' (default), 'append',
       'stream_map' or 'append_map'.
    :return: A Deephaven live table that will update based on Kafka messages consumed for the given topic.
    :raises: ValueError or TypeError if arguments provided can't be processed.
    """

    if not _isStr(topic):
        raise ValueError("argument 'topic' has to be of str type, instead got " + topic)

    partitions = _jpy_partitions(partitions)

    if offsets is None:
        offsets = ALL_PARTITIONS_DONT_SEEK
    elif isinstance(offsets, dict):
        try:
            partitions_array = jpy.array('int', list(offsets.keys()))
            offsets_array = jpy.array('long', list(offsets.values()))
            offsets = _java_type_.partitionToOffsetFromParallelArrays(partitions_array, offsets_array)
        except Exception as e:
            raise ValueError(
                "when of type dict, keyword argument 'offsets' has to map " +
                "numeric partitions to either numeric offsets, or one of the constants { " +
                "SEEK_TO_BEGINNING, DONT_SEEK, SEEK_TO_END }, " +
                "instead got offsets=" + str(offsets)
            ) from e
    elif not isinstance(offsets, jpy.JType):
        raise TypeError(
            "value " + str(offsets) + " of type " + type(offsets).__name__ +
            " not recognized for argument 'offsets'; only None, dict, or predefined constants allowed")

    if key is None:
        key = FROM_PROPERTIES
    if value is None:
        value = FROM_PROPERTIES
    if key is IGNORE and value is IGNORE:
        raise ValueError(
            "at least one argument for 'key' or 'value' must be different from IGNORE")

    if not _isStr(table_type):
        raise TypeError(
            "argument 'table_type' expected to be of type str, instead got " +
            str(table_type) + " of type " + type(table_type).__name__)
    table_type_enum = _jpy_table_type(table_type)

    kafka_config = _dictToProperties(kafka_config)
    return _java_type_.consumeToTable(kafka_config, topic, partitions, offsets, key, value, table_type_enum)
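
A minimal consumption sketch follows. The broker address, topic, partitions, offsets, and the value-column type string are placeholders; the property keys and the constants SEEK_TO_END and IGNORE come from the docstring above.

# Hypothetical example: replay partition 0 from offset 0 and partition 1 from its end,
# ignoring the message key and landing each value in a single column.
kafka_config = {
    'bootstrap.servers': 'localhost:9092',     # placeholder broker address
    'deephaven.value.column.name': 'Payload',
    'deephaven.value.column.type': 'String',   # assumed spelling of the type name
}
events = consumeToTable(
    kafka_config,
    'events_topic',
    partitions=[0, 1],
    offsets={0: 0, 1: SEEK_TO_END},   # per-partition starting offsets, as described above
    key=IGNORE,                       # no key column in the resulting table
    value=None,                       # single value column taken from the properties above
    table_type='append',
)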