def get_table_name(name_transformer: DestinationNameTransformer, parent: str, child: str, suffix: str, json_path: List[str]) -> str:
    """Build a destination table name from parent/child/suffix plus a hash of the JSON path.

    Truncation strategy, in order of preference: keep everything intact,
    truncate only the parent, then truncate the parent to a minimum length
    and middle-truncate the child as a last resort.
    """
    # Reserve two characters for the underscores separating the name parts.
    budget = name_transformer.get_name_max_length() - 2
    path_hash = hash_json_path(json_path)
    tail = f"_{suffix}" if suffix and not suffix.startswith("_") else suffix
    head = name_transformer.normalize_table_name(parent, False, False) if parent else parent
    middle = name_transformer.normalize_table_name(child, False, False)
    parent_floor = min(MINIMUM_PARENT_LENGTH, len(head))
    if not parent:
        # No nesting: just child + suffix, truncated to the destination's limit.
        return name_transformer.truncate_identifier_name(f"{middle}{tail}")
    if len(head) + len(middle) + len(path_hash) + len(tail) < budget:
        # Everything fits without truncation.
        return f"{head}_{path_hash}_{middle}{tail}"
    if len(middle) + len(path_hash) + len(tail) < budget - parent_floor:
        # Only the parent is too long: truncate it to whatever room remains.
        room_for_parent = budget - len(middle) - len(path_hash) - len(tail)
        return f"{head[:room_for_parent]}_{path_hash}_{middle}{tail}"
    # Shrink the parent to its floor and middle-truncate the child.
    room_for_child = budget - parent_floor - len(path_hash) - len(tail)
    short_child = name_transformer.truncate_identifier_name(middle, room_for_child)
    return f"{head[:parent_floor]}_{path_hash}_{short_child}{tail}"
def get_nested_hashed_table_name(name_transformer: DestinationNameTransformer, schema: str, json_path: List[str], child: str) -> str:
    """Name a nested table from its parent path, a hash of schema + JSON path, and the child field.

    The table name combines, when it fits within the destination's identifier
    limit: the parent path (where the table is extracted from), a hash of the
    schema-qualified JSON path (to disambiguate), and the child field name.
    If it does not fit, the parent is truncated first (never below
    MINIMUM_PARENT_LENGTH); as a last resort the child is middle-truncated too.

    Raises:
        RuntimeError: when json_path has no parent component (non-nested table).
    """
    parent = "_".join(json_path[:-1])
    limit = name_transformer.get_name_max_length()
    path_hash = hash_json_path([schema] + json_path)
    parent_part = name_transformer.normalize_table_name(parent, False, False) if parent else parent
    child_part = name_transformer.normalize_table_name(child, False, False)
    parent_floor = min(MINIMUM_PARENT_LENGTH, len(parent_part))
    if not parent:
        raise RuntimeError("There is no nested table names without parents")
    # The two underscores joining the three parts always cost 2 characters.
    if len(parent_part) + len(path_hash) + len(child_part) + 2 < limit:
        # Everything fits without truncation.
        return f"{parent_part}_{path_hash}_{child_part}"
    if parent_floor + len(path_hash) + len(child_part) + 2 < limit:
        # Truncate only the parent; it still keeps at least parent_floor characters.
        parent_room = limit - len(path_hash) - len(child_part) - 2
        return f"{parent_part[:parent_room]}_{path_hash}_{child_part}"
    # Shrink the parent to its floor and middle-truncate the child.
    child_room = limit - len(path_hash) - 2 - parent_floor
    short_child = name_transformer.truncate_identifier_name(child_part, child_room)
    return f"{parent_part[:parent_floor]}_{path_hash}_{short_child}"
def build_stream_processor(
    catalog: Dict,
    json_column_name: str,
    default_schema: str,
    name_transformer: DestinationNameTransformer,
    destination_type: DestinationType,
    tables_registry: TableNameRegistry,
) -> List[StreamProcessor]:
    """Instantiate one StreamProcessor per configured stream in the catalog.

    Schema and raw-table naming here mirrors the destination-side logic so
    the generated models line up with the raw tables written by the
    destination connector.
    """
    processors: List[StreamProcessor] = []
    for configured_stream in get_field(catalog, "streams", "Invalid Catalog: 'streams' is not defined in Catalog"):
        stream_config = get_field(configured_stream, "stream", "Invalid Stream: 'stream' is not defined in Catalog streams")
        # The logic here matches the logic in JdbcBufferedConsumerFactory.java.
        # Any modifications need to be reflected there and vice versa.
        schema = stream_config["namespace"] if "namespace" in stream_config else default_schema
        schema_name = name_transformer.normalize_schema_name(schema, truncate=False)
        raw_schema_name = name_transformer.normalize_schema_name(f"_airbyte_{schema}", truncate=False)
        stream_name = get_field(stream_config, "name", f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}")
        raw_table_name = name_transformer.normalize_table_name(f"_airbyte_raw_{stream_name}", truncate=False)
        source_sync_mode = get_source_sync_mode(configured_stream, stream_name)
        destination_sync_mode = get_destination_sync_mode(configured_stream, stream_name)
        # Dedup destination modes (and incremental sources) require cursor / primary-key metadata.
        dedup_modes = [
            # DestinationSyncMode.upsert_dedup.value,
            DestinationSyncMode.append_dedup.value,
        ]
        cursor_field = []
        primary_key = []
        if source_sync_mode.value == SyncMode.incremental.value or destination_sync_mode.value in dedup_modes:
            cursor_field = get_field(configured_stream, "cursor_field", f"Undefined cursor field for stream {stream_name}")
        if destination_sync_mode.value in dedup_modes:
            primary_key = get_field(configured_stream, "primary_key", f"Undefined primary key for stream {stream_name}")
        message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
        properties = get_field(get_field(stream_config, "json_schema", message), "properties", message)
        from_table = f"source('{schema_name}', '{raw_table_name}')"
        processors.append(
            StreamProcessor.create(
                stream_name=stream_name,
                destination_type=destination_type,
                raw_schema=raw_schema_name,
                schema=schema_name,
                source_sync_mode=source_sync_mode,
                destination_sync_mode=destination_sync_mode,
                cursor_field=cursor_field,
                primary_key=primary_key,
                json_column_name=f"'{json_column_name}'",
                properties=properties,
                tables_registry=tables_registry,
                from_table=from_table,
            )
        )
    return processors
def get_table_name(name_transformer: DestinationNameTransformer, parent: str, child: str, suffix: str, json_path: List[str]) -> str:
    """Combine parent table, child table, suffix and a JSON-path hash into one table name.

    The naming combines information from:
    - parent table: where the table is extracted from (in case of nesting)
    - child table: the nested field name, or the original stream name
    - extra suffix: distinguishes the separate tables produced by each
      transformation step of the normalization pipeline
    - json path: the parent and nested field names leading to this table

    As much of this as fits within the destination's identifier limit is kept,
    so users can recognize what data the table holds. When the limit is
    exceeded, the parent is truncated first; as a last resort the child is
    middle-truncated as well.
    """
    limit = name_transformer.get_name_max_length() - 2  # two separator underscores are always emitted
    hashed_path = hash_json_path(json_path)
    suffix_part = suffix if not suffix or suffix.startswith("_") else f"_{suffix}"
    parent_part = parent if not parent else name_transformer.normalize_table_name(parent, False, False)
    child_part = name_transformer.normalize_table_name(child, False, False)
    parent_min = min(MINIMUM_PARENT_LENGTH, len(parent_part))
    if not parent:
        # Top-level table: no parent prefix nor path hash needed.
        return name_transformer.truncate_identifier_name(f"{child_part}{suffix_part}")
    # Space consumed by the components that are never truncated in the middle cases.
    fixed = len(child_part) + len(hashed_path) + len(suffix_part)
    if len(parent_part) + fixed < limit:
        # Everything fits: no truncation at all.
        return f"{parent_part}_{hashed_path}_{child_part}{suffix_part}"
    if fixed < limit - parent_min:
        # Only the parent overflows: shrink it down to the remaining room.
        parent_room = limit - fixed
        return f"{parent_part[:parent_room]}_{hashed_path}_{child_part}{suffix_part}"
    # Shrink the parent to its minimum and middle-truncate the child.
    child_room = limit - parent_min - len(hashed_path) - len(suffix_part)
    short_child = name_transformer.truncate_identifier_name(child_part, child_room)
    return f"{parent_part[:parent_min]}_{hashed_path}_{short_child}{suffix_part}"
def build_stream_processor(
    catalog: Dict,
    json_column_name: str,
    target_schema: str,
    name_transformer: DestinationNameTransformer,
    destination_type: DestinationType,
    tables_registry: Set[str],
) -> List[StreamProcessor]:
    """Create a StreamProcessor for every stream declared in the catalog.

    Raises:
        EOFError: when a stream's 'json_schema'.'properties' is empty.
    """
    processors = []
    streams = get_field(catalog, "streams", "Invalid Catalog: 'streams' is not defined in Catalog")
    for configured_stream in streams:
        stream_config = get_field(configured_stream, "stream", "Invalid Stream: 'stream' is not defined in Catalog streams")
        schema_name = name_transformer.normalize_schema_name(target_schema)
        # Raw tables live in a dedicated "_airbyte_" schema alongside the target schema.
        raw_schema_name = name_transformer.normalize_schema_name(f"_airbyte_{target_schema}", truncate=False)
        stream_name = get_field(stream_config, "name", f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}")
        raw_table_name = name_transformer.normalize_table_name(f"_airbyte_raw_{stream_name}", truncate=False)
        message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
        properties = get_field(get_field(stream_config, "json_schema", message), "properties", message)
        from_table = f"source('{schema_name}', '{raw_table_name}')"
        # Check properties
        if not properties:
            raise EOFError("Invalid Catalog: Unexpected empty properties in catalog")
        processors.append(
            StreamProcessor.create(
                stream_name=stream_name,
                destination_type=destination_type,
                raw_schema=raw_schema_name,
                schema=schema_name,
                json_column_name=f"'{json_column_name}'",
                properties=properties,
                tables_registry=tables_registry,
                from_table=from_table,
            )
        )
    return processors
def build_stream_processor(
    catalog: Dict,
    json_column_name: str,
    target_schema: str,
    name_transformer: DestinationNameTransformer,
    destination_type: DestinationType,
    tables_registry: Set[str],
) -> List[StreamProcessor]:
    """Create a StreamProcessor per catalog stream, wiring in sync-mode metadata.

    Raises:
        EOFError: when a stream's 'json_schema'.'properties' is empty.
    """
    processors = []
    for configured_stream in get_field(catalog, "streams", "Invalid Catalog: 'streams' is not defined in Catalog"):
        stream_config = get_field(configured_stream, "stream", "Invalid Stream: 'stream' is not defined in Catalog streams")
        schema_name = name_transformer.normalize_schema_name(target_schema)
        raw_schema_name = name_transformer.normalize_schema_name(f"_airbyte_{target_schema}", truncate=False)
        stream_name = get_field(stream_config, "name", f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}")
        raw_table_name = name_transformer.normalize_table_name(f"_airbyte_raw_{stream_name}", truncate=False)
        source_sync_mode = get_source_sync_mode(configured_stream, stream_name)
        destination_sync_mode = get_destination_sync_mode(configured_stream, stream_name)
        # Dedup destinations need a primary key; incremental sources (and dedup
        # destinations) additionally need a cursor field.
        is_incremental = source_sync_mode.value == SyncMode.incremental.value
        needs_dedup = destination_sync_mode.value in [
            # DestinationSyncMode.upsert_dedup.value,
            DestinationSyncMode.append_dedup.value,
        ]
        cursor_field = (
            get_field(configured_stream, "cursor_field", f"Undefined cursor field for stream {stream_name}")
            if is_incremental or needs_dedup
            else []
        )
        primary_key = (
            get_field(configured_stream, "primary_key", f"Undefined primary key for stream {stream_name}")
            if needs_dedup
            else []
        )
        message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
        properties = get_field(get_field(stream_config, "json_schema", message), "properties", message)
        from_table = f"source('{schema_name}', '{raw_table_name}')"
        # Check properties
        if not properties:
            raise EOFError("Invalid Catalog: Unexpected empty properties in catalog")
        processors.append(
            StreamProcessor.create(
                stream_name=stream_name,
                destination_type=destination_type,
                raw_schema=raw_schema_name,
                schema=schema_name,
                source_sync_mode=source_sync_mode,
                destination_sync_mode=destination_sync_mode,
                cursor_field=cursor_field,
                primary_key=primary_key,
                json_column_name=f"'{json_column_name}'",
                properties=properties,
                tables_registry=tables_registry,
                from_table=from_table,
            )
        )
    return processors
def build_stream_processor(
    catalog: Dict,
    json_column_name: str,
    default_schema: str,
    name_transformer: DestinationNameTransformer,
    destination_type: DestinationType,
    tables_registry: TableNameRegistry,
) -> List[StreamProcessor]:
    """Create a StreamProcessor for every stream in the catalog, with per-destination quoting.

    Oracle and MySQL get special handling for the JSON column quoting and raw
    schema/table naming; all other destinations use the common
    "_airbyte_<schema>" raw schema and single-quoted JSON column.
    """
    result = []
    for configured_stream in get_field(catalog, "streams", "Invalid Catalog: 'streams' is not defined in Catalog"):
        stream_config = get_field(configured_stream, "stream", "Invalid Stream: 'stream' is not defined in Catalog streams")
        # The logic here matches the logic in JdbcBufferedConsumerFactory.java.
        # Any modifications need to be reflected there and vice versa.
        schema = default_schema
        if "namespace" in stream_config:
            schema = stream_config["namespace"]
        schema_name = name_transformer.normalize_schema_name(schema, truncate=False)
        if destination_type == DestinationType.ORACLE:
            # Oracle: raw tables live in the same schema as the final tables, and the
            # JSON column is normalized unless it is already wrapped in quote(...).
            quote_in_parenthesis = re.compile(r"quote\((.*)\)")
            raw_schema_name = name_transformer.normalize_schema_name(schema, truncate=False)
            if not quote_in_parenthesis.findall(json_column_name):
                json_column_name = name_transformer.normalize_column_name(json_column_name, in_jinja=True)
        else:
            # Other destinations: raw tables live in a dedicated "_airbyte_" schema, and
            # the JSON column gets single quotes unless it already carries them.
            column_inside_single_quote = re.compile(r"\'(.*)\'")
            raw_schema_name = name_transformer.normalize_schema_name(f"_airbyte_{schema}", truncate=False)
            if not column_inside_single_quote.findall(json_column_name):
                json_column_name = f"'{json_column_name}'"
        stream_name = get_field(stream_config, "name", f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}")
        # MySQL table names need to be manually truncated, because it does not do it automatically
        truncate = destination_type == DestinationType.MYSQL
        raw_table_name = name_transformer.normalize_table_name(f"_airbyte_raw_{stream_name}", truncate=truncate)
        source_sync_mode = get_source_sync_mode(configured_stream, stream_name)
        destination_sync_mode = get_destination_sync_mode(configured_stream, stream_name)
        cursor_field = []
        primary_key = []
        # Incremental sources and dedup destinations require a cursor field.
        if source_sync_mode.value == SyncMode.incremental.value or destination_sync_mode.value in [
            # DestinationSyncMode.upsert_dedup.value,
            DestinationSyncMode.append_dedup.value,
        ]:
            cursor_field = get_field(configured_stream, "cursor_field", f"Undefined cursor field for stream {stream_name}")
        # Dedup destinations additionally require a primary key.
        if destination_sync_mode.value in [
            # DestinationSyncMode.upsert_dedup.value,
            DestinationSyncMode.append_dedup.value
        ]:
            primary_key = get_field(configured_stream, "primary_key", f"Undefined primary key for stream {stream_name}")
        message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
        properties = get_field(get_field(stream_config, "json_schema", message), "properties", message)
        # Reference the raw table through a dbt source() macro rather than a plain string.
        from_table = dbt_macro.Source(schema_name, raw_table_name)
        stream_processor = StreamProcessor.create(
            stream_name=stream_name,
            destination_type=destination_type,
            raw_schema=raw_schema_name,
            default_schema=default_schema,
            schema=schema_name,
            source_sync_mode=source_sync_mode,
            destination_sync_mode=destination_sync_mode,
            cursor_field=cursor_field,
            primary_key=primary_key,
            json_column_name=json_column_name,
            properties=properties,
            tables_registry=tables_registry,
            from_table=from_table,
        )
        result.append(stream_processor)
    return result