Example #1
def get_table_name(name_transformer: DestinationNameTransformer, parent: str,
                   child: str, suffix: str, json_path: List[str]) -> str:
    max_length = name_transformer.get_name_max_length() - 2  # less two for the underscores
    json_path_hash = hash_json_path(json_path)
    norm_suffix = suffix if not suffix or suffix.startswith("_") else f"_{suffix}"
    norm_parent = parent if not parent else name_transformer.normalize_table_name(parent, False, False)
    norm_child = name_transformer.normalize_table_name(child, False, False)
    min_parent_length = min(MINIMUM_PARENT_LENGTH, len(norm_parent))

    # no parent
    if not parent:
        return name_transformer.truncate_identifier_name(
            f"{norm_child}{norm_suffix}")
    # if everything fits without truncation, don't truncate anything
    elif (len(norm_parent) + len(norm_child) + len(json_path_hash) +
          len(norm_suffix)) < max_length:
        return f"{norm_parent}_{json_path_hash}_{norm_child}{norm_suffix}"
    # if everything fits except for the parent, just truncate the parent
    elif (len(norm_child) + len(json_path_hash) +
          len(norm_suffix)) < (max_length - min_parent_length):
        max_parent_length = max_length - len(norm_child) - len(json_path_hash) - len(norm_suffix)
        return f"{norm_parent[:max_parent_length]}_{json_path_hash}_{norm_child}{norm_suffix}"
    # otherwise first truncate parent to the minimum length and middle truncate the child
    else:
        norm_child_max_length = max_length - min_parent_length - len(json_path_hash) - len(norm_suffix)
        trunc_norm_child = name_transformer.truncate_identifier_name(
            norm_child, norm_child_max_length)
        return f"{norm_parent[:min_parent_length]}_{json_path_hash}_{trunc_norm_child}{norm_suffix}"
Example #2
def get_nested_hashed_table_name(name_transformer: DestinationNameTransformer, schema: str, json_path: List[str], child: str) -> str:
    """
    In the normalization code base, we often have to deal with naming tables, combining information from:
    - parent table: to denote where a table is extracted from (in case of nesting)
    - child table: in case of nesting, the field name or the original stream name
    - extra suffix: normalization is done in multiple transformation steps, each of which may need to generate separate
    tables, so a suffix can be added to distinguish the different transformation steps of a pipeline.
    - json path: the chain of parent and nested field names leading to the table currently being built

    All this information should be included (if possible) in the table name so the user can identify and
    recognize what data is available there.
    """
    parent = "_".join(json_path[:-1])
    max_length = name_transformer.get_name_max_length()
    json_path_hash = hash_json_path([schema] + json_path)
    norm_parent = parent if not parent else name_transformer.normalize_table_name(parent, False, False)
    norm_child = name_transformer.normalize_table_name(child, False, False)
    min_parent_length = min(MINIMUM_PARENT_LENGTH, len(norm_parent))

    # no parent
    if not parent:
        raise RuntimeError("There is no nested table names without parents")
    # if everything fits without truncation, don't truncate anything
    elif (len(norm_parent) + len(json_path_hash) + len(norm_child) + 2) < max_length:
        return f"{norm_parent}_{json_path_hash}_{norm_child}"
    # if everything fits except for the parent, just truncate the parent (still guarantees parent is of length min_parent_length)
    elif (min_parent_length + len(json_path_hash) + len(norm_child) + 2) < max_length:
        max_parent_length = max_length - len(json_path_hash) - len(norm_child) - 2
        return f"{norm_parent[:max_parent_length]}_{json_path_hash}_{norm_child}"
    # otherwise first truncate parent to the minimum length and middle truncate the child too
    else:
        norm_child_max_length = max_length - len(json_path_hash) - 2 - min_parent_length
        trunc_norm_child = name_transformer.truncate_identifier_name(norm_child, norm_child_max_length)
        return f"{norm_parent[:min_parent_length]}_{json_path_hash}_{trunc_norm_child}"
Example #3
    def build_stream_processor(
        catalog: Dict,
        json_column_name: str,
        default_schema: str,
        name_transformer: DestinationNameTransformer,
        destination_type: DestinationType,
        tables_registry: TableNameRegistry,
    ) -> List[StreamProcessor]:
        result = []
        for configured_stream in get_field(catalog, "streams", "Invalid Catalog: 'streams' is not defined in Catalog"):
            stream_config = get_field(configured_stream, "stream", "Invalid Stream: 'stream' is not defined in Catalog streams")

            # The logic here matches the logic in JdbcBufferedConsumerFactory.java.
            # Any modifications need to be reflected there and vice versa.
            schema = default_schema
            if "namespace" in stream_config:
                schema = stream_config["namespace"]

            schema_name = name_transformer.normalize_schema_name(schema, truncate=False)
            raw_schema_name = name_transformer.normalize_schema_name(f"_airbyte_{schema}", truncate=False)
            stream_name = get_field(stream_config, "name", f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}")
            raw_table_name = name_transformer.normalize_table_name(f"_airbyte_raw_{stream_name}", truncate=False)

            source_sync_mode = get_source_sync_mode(configured_stream, stream_name)
            destination_sync_mode = get_destination_sync_mode(configured_stream, stream_name)
            cursor_field = []
            primary_key = []
            if source_sync_mode.value == SyncMode.incremental.value or destination_sync_mode.value in [
                # DestinationSyncMode.upsert_dedup.value,
                DestinationSyncMode.append_dedup.value,
            ]:
                cursor_field = get_field(configured_stream, "cursor_field", f"Undefined cursor field for stream {stream_name}")
            if destination_sync_mode.value in [
                # DestinationSyncMode.upsert_dedup.value,
                DestinationSyncMode.append_dedup.value
            ]:
                primary_key = get_field(configured_stream, "primary_key", f"Undefined primary key for stream {stream_name}")

            message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
            properties = get_field(get_field(stream_config, "json_schema", message), "properties", message)

            from_table = "source('{}', '{}')".format(schema_name, raw_table_name)

            stream_processor = StreamProcessor.create(
                stream_name=stream_name,
                destination_type=destination_type,
                raw_schema=raw_schema_name,
                schema=schema_name,
                source_sync_mode=source_sync_mode,
                destination_sync_mode=destination_sync_mode,
                cursor_field=cursor_field,
                primary_key=primary_key,
                json_column_name=f"'{json_column_name}'",
                properties=properties,
                tables_registry=tables_registry,
                from_table=from_table,
            )
            result.append(stream_processor)
        return result
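
For context, here is a minimal, made-up catalog that satisfies the get_field lookups above. The field names are taken straight from those calls; the values and the sync-mode keys are assumptions based on the Airbyte configured-catalog format:

# Minimal made-up catalog matching the fields read above; real catalogs come
# from Airbyte's configured catalog JSON.
catalog = {
    "streams": [
        {
            "stream": {
                "name": "users",
                "namespace": "public",  # optional; overrides default_schema when present
                "json_schema": {"properties": {"id": {"type": "integer"}}},
            },
            # sync-mode fields read by get_source_sync_mode / get_destination_sync_mode
            # (key names assumed from the configured-catalog format):
            "sync_mode": "full_refresh",
            "destination_sync_mode": "append",
        }
    ]
}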
Example #4
def get_table_name(name_transformer: DestinationNameTransformer, parent: str,
                   child: str, suffix: str, json_path: List[str]) -> str:
    """
    In the normalization code base, we often have to deal with naming tables, combining information from:
    - parent table: to denote where a table is extracted from (in case of nesting)
    - child table: in case of nesting, the field name or the original stream name
    - extra suffix: normalization is done in multiple transformation steps, each of which may need to generate separate
    tables, so a suffix can be added to distinguish the different transformation steps of a pipeline.
    - json path: the chain of parent and nested field names leading to the table currently being built

    All this information should be included (if possible) in the table name so the user can identify and
    recognize what data is available there.
    """
    max_length = name_transformer.get_name_max_length() - 2  # less two for the underscores
    json_path_hash = hash_json_path(json_path)
    norm_suffix = suffix if not suffix or suffix.startswith("_") else f"_{suffix}"
    norm_parent = parent if not parent else name_transformer.normalize_table_name(parent, False, False)
    norm_child = name_transformer.normalize_table_name(child, False, False)
    min_parent_length = min(MINIMUM_PARENT_LENGTH, len(norm_parent))

    # no parent
    if not parent:
        return name_transformer.truncate_identifier_name(
            f"{norm_child}{norm_suffix}")
    # if everything fits without truncation, don't truncate anything
    elif (len(norm_parent) + len(norm_child) + len(json_path_hash) +
          len(norm_suffix)) < max_length:
        return f"{norm_parent}_{json_path_hash}_{norm_child}{norm_suffix}"
    # if everything fits except for the parent, just truncate the parent
    elif (len(norm_child) + len(json_path_hash) +
          len(norm_suffix)) < (max_length - min_parent_length):
        max_parent_length = max_length - len(norm_child) - len(json_path_hash) - len(norm_suffix)
        return f"{norm_parent[:max_parent_length]}_{json_path_hash}_{norm_child}{norm_suffix}"
    # otherwise first truncate parent to the minimum length and middle truncate the child
    else:
        norm_child_max_length = max_length - min_parent_length - len(json_path_hash) - len(norm_suffix)
        trunc_norm_child = name_transformer.truncate_identifier_name(
            norm_child, norm_child_max_length)
        return f"{norm_parent[:min_parent_length]}_{json_path_hash}_{trunc_norm_child}{norm_suffix}"
Example #5
    def build_stream_processor(
        catalog: Dict,
        json_column_name: str,
        target_schema: str,
        name_transformer: DestinationNameTransformer,
        destination_type: DestinationType,
        tables_registry: Set[str],
    ) -> List[StreamProcessor]:
        result = []
        for configured_stream in get_field(
                catalog, "streams",
                "Invalid Catalog: 'streams' is not defined in Catalog"):
            stream_config = get_field(
                configured_stream, "stream",
                "Invalid Stream: 'stream' is not defined in Catalog streams")
            schema_name = name_transformer.normalize_schema_name(target_schema)
            raw_schema_name = name_transformer.normalize_schema_name(
                f"_airbyte_{target_schema}", truncate=False)
            stream_name = get_field(
                stream_config, "name",
                f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}"
            )
            raw_table_name = name_transformer.normalize_table_name(
                f"_airbyte_raw_{stream_name}", truncate=False)

            message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
            properties = get_field(
                get_field(stream_config, "json_schema", message), "properties",
                message)

            from_table = "source('{}', '{}')".format(schema_name,
                                                     raw_table_name)

            # Check properties
            if not properties:
                raise EOFError(
                    "Invalid Catalog: Unexpected empty properties in catalog")

            stream_processor = StreamProcessor.create(
                stream_name=stream_name,
                destination_type=destination_type,
                raw_schema=raw_schema_name,
                schema=schema_name,
                json_column_name=f"'{json_column_name}'",
                properties=properties,
                tables_registry=tables_registry,
                from_table=from_table,
            )
            result.append(stream_processor)
        return result
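
Note the guard near the end: a stream whose json_schema declares an empty properties object would still pass the two get_field lookups (the keys exist) but gets rejected by the check. For example, with a made-up stream entry:

# Made-up stream entry whose schema declares no properties; the get_field
# lookups succeed but the "Check properties" guard above raises.
configured_stream = {
    "stream": {
        "name": "empty_stream",
        "json_schema": {"properties": {}},
    }
}
# build_stream_processor(...) on a catalog containing this stream would raise:
#   EOFError: Invalid Catalog: Unexpected empty properties in catalog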
Example #6
    def build_stream_processor(
        catalog: Dict,
        json_column_name: str,
        target_schema: str,
        name_transformer: DestinationNameTransformer,
        destination_type: DestinationType,
        tables_registry: Set[str],
    ) -> List[StreamProcessor]:
        result = []
        for configured_stream in get_field(
                catalog, "streams",
                "Invalid Catalog: 'streams' is not defined in Catalog"):
            stream_config = get_field(
                configured_stream, "stream",
                "Invalid Stream: 'stream' is not defined in Catalog streams")
            schema_name = name_transformer.normalize_schema_name(target_schema)
            raw_schema_name = name_transformer.normalize_schema_name(
                f"_airbyte_{target_schema}", truncate=False)
            stream_name = get_field(
                stream_config, "name",
                f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}"
            )
            raw_table_name = name_transformer.normalize_table_name(
                f"_airbyte_raw_{stream_name}", truncate=False)

            source_sync_mode = get_source_sync_mode(configured_stream,
                                                    stream_name)
            destination_sync_mode = get_destination_sync_mode(
                configured_stream, stream_name)
            cursor_field = []
            primary_key = []
            if source_sync_mode.value == SyncMode.incremental.value or destination_sync_mode.value in [
                    # DestinationSyncMode.upsert_dedup.value,
                    DestinationSyncMode.append_dedup.value,
            ]:
                cursor_field = get_field(
                    configured_stream, "cursor_field",
                    f"Undefined cursor field for stream {stream_name}")
            if destination_sync_mode.value in [
                    # DestinationSyncMode.upsert_dedup.value,
                    DestinationSyncMode.append_dedup.value
            ]:
                primary_key = get_field(
                    configured_stream, "primary_key",
                    f"Undefined primary key for stream {stream_name}")

            message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
            properties = get_field(
                get_field(stream_config, "json_schema", message), "properties",
                message)

            from_table = "source('{}', '{}')".format(schema_name,
                                                     raw_table_name)

            # Check properties
            if not properties:
                raise EOFError(
                    "Invalid Catalog: Unexpected empty properties in catalog")

            stream_processor = StreamProcessor.create(
                stream_name=stream_name,
                destination_type=destination_type,
                raw_schema=raw_schema_name,
                schema=schema_name,
                source_sync_mode=source_sync_mode,
                destination_sync_mode=destination_sync_mode,
                cursor_field=cursor_field,
                primary_key=primary_key,
                json_column_name=f"'{json_column_name}'",
                properties=properties,
                tables_registry=tables_registry,
                from_table=from_table,
            )
            result.append(stream_processor)
        return result
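
The sync-mode block above means cursor_field must be defined when the source syncs incrementally or the destination deduplicates, and primary_key must be defined for append_dedup. A made-up configured stream that satisfies both (key names assumed from the Airbyte configured-catalog format):

# Made-up configured stream satisfying the append_dedup requirements above.
configured_stream = {
    "stream": {
        "name": "users",
        "json_schema": {"properties": {"id": {"type": "integer"}, "updated_at": {"type": "string"}}},
    },
    "sync_mode": "incremental",
    "destination_sync_mode": "append_dedup",
    "cursor_field": ["updated_at"],  # required: incremental source or dedup destination
    "primary_key": [["id"]],         # required: append_dedup destination
}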
Example #7
    def build_stream_processor(
        catalog: Dict,
        json_column_name: str,
        default_schema: str,
        name_transformer: DestinationNameTransformer,
        destination_type: DestinationType,
        tables_registry: TableNameRegistry,
    ) -> List[StreamProcessor]:
        result = []
        for configured_stream in get_field(catalog, "streams", "Invalid Catalog: 'streams' is not defined in Catalog"):
            stream_config = get_field(configured_stream, "stream", "Invalid Stream: 'stream' is not defined in Catalog streams")

            # The logic here matches the logic in JdbcBufferedConsumerFactory.java.
            # Any modifications need to be reflected there and vice versa.
            schema = default_schema
            if "namespace" in stream_config:
                schema = stream_config["namespace"]

            schema_name = name_transformer.normalize_schema_name(schema, truncate=False)
            if destination_type == DestinationType.ORACLE:
                quote_in_parenthesis = re.compile(r"quote\((.*)\)")
                raw_schema_name = name_transformer.normalize_schema_name(schema, truncate=False)
                if not quote_in_parenthesis.findall(json_column_name):
                    json_column_name = name_transformer.normalize_column_name(json_column_name, in_jinja=True)
            else:
                column_inside_single_quote = re.compile(r"\'(.*)\'")
                raw_schema_name = name_transformer.normalize_schema_name(f"_airbyte_{schema}", truncate=False)
                if not column_inside_single_quote.findall(json_column_name):
                    json_column_name = f"'{json_column_name}'"

            stream_name = get_field(stream_config, "name", f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}")
            # MySQL table names need to be truncated manually, because MySQL does not do it automatically
            truncate = destination_type == DestinationType.MYSQL
            raw_table_name = name_transformer.normalize_table_name(f"_airbyte_raw_{stream_name}", truncate=truncate)

            source_sync_mode = get_source_sync_mode(configured_stream, stream_name)
            destination_sync_mode = get_destination_sync_mode(configured_stream, stream_name)
            cursor_field = []
            primary_key = []
            if source_sync_mode.value == SyncMode.incremental.value or destination_sync_mode.value in [
                # DestinationSyncMode.upsert_dedup.value,
                DestinationSyncMode.append_dedup.value,
            ]:
                cursor_field = get_field(configured_stream, "cursor_field", f"Undefined cursor field for stream {stream_name}")
            if destination_sync_mode.value in [
                # DestinationSyncMode.upsert_dedup.value,
                DestinationSyncMode.append_dedup.value
            ]:
                primary_key = get_field(configured_stream, "primary_key", f"Undefined primary key for stream {stream_name}")

            message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
            properties = get_field(get_field(stream_config, "json_schema", message), "properties", message)

            from_table = dbt_macro.Source(schema_name, raw_table_name)

            stream_processor = StreamProcessor.create(
                stream_name=stream_name,
                destination_type=destination_type,
                raw_schema=raw_schema_name,
                default_schema=default_schema,
                schema=schema_name,
                source_sync_mode=source_sync_mode,
                destination_sync_mode=destination_sync_mode,
                cursor_field=cursor_field,
                primary_key=primary_key,
                json_column_name=json_column_name,
                properties=properties,
                tables_registry=tables_registry,
                from_table=from_table,
            )
            result.append(stream_processor)
        return result
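
The json_column_name handling above differs per destination: for Oracle the raw schema keeps the plain schema name and the column is normalized unless it is already wrapped in quote(...), while every other destination prefixes the raw schema with _airbyte_ and wraps the column in single quotes unless it already is. A quick, self-contained illustration of the non-Oracle quoting branch (column names are made up):

import re

# Made-up values illustrating the non-Oracle quoting branch above.
column_inside_single_quote = re.compile(r"\'(.*)\'")
for json_column_name in ("_airbyte_data", "'_airbyte_data'"):
    if not column_inside_single_quote.findall(json_column_name):
        json_column_name = f"'{json_column_name}'"
    print(json_column_name)
# both print '_airbyte_data' -- names that are already quoted are left untouched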