def get_table_name(name_transformer: DestinationNameTransformer, parent: str, child: str, suffix: str, json_path: List[str]) -> str:
    """
    Combine a parent name, the hash of the json path, a child name and a suffix into one table name.

    @param name_transformer provides the maximum identifier length, name normalization and truncation
    @param parent is the name of the parent table (falsy when the table is not nested)
    @param child is the name of the table being built
    @param suffix is an optional suffix; an underscore is prepended when it does not already start with one
    @param json_path is passed to hash_json_path to produce the middle part of the name
    @return "<parent>_<hash>_<child><suffix>", with the parent and then the child shortened when the
            result would not fit; without a parent, the truncated "<child><suffix>"
    """
    # two characters are reserved for the underscores joining parent, hash and child
    budget = name_transformer.get_name_max_length() - 2
    path_hash = hash_json_path(json_path)
    if suffix and not suffix.startswith("_"):
        tail = f"_{suffix}"
    else:
        tail = suffix
    if parent:
        head = name_transformer.normalize_table_name(parent, False, False)
    else:
        head = parent
    body = name_transformer.normalize_table_name(child, False, False)
    head_floor = min(MINIMUM_PARENT_LENGTH, len(head))

    # no parent: only the child and the suffix make up the name
    if not parent:
        return name_transformer.truncate_identifier_name(f"{body}{tail}")

    # length of everything except the parent
    fixed_length = len(body) + len(path_hash) + len(tail)

    # everything fits: nothing is truncated
    if len(head) + fixed_length < budget:
        return f"{head}_{path_hash}_{body}{tail}"

    # everything but the parent fits: only the parent is cut
    if fixed_length < budget - head_floor:
        head_budget = budget - fixed_length
        return f"{head[:head_budget]}_{path_hash}_{body}{tail}"

    # otherwise the parent is cut to its minimum length and the child is truncated too
    body_budget = budget - head_floor - len(path_hash) - len(tail)
    short_body = name_transformer.truncate_identifier_name(body, body_budget)
    return f"{head[:head_floor]}_{path_hash}_{short_body}{tail}"
def test_normalize_column_name(input_str: str, destination_type: str, expected: str, expected_in_jinja: str):
    """
    Check normalize_column_name for one input, both outside and inside a jinja context.

    @param input_str is the column name to normalize
    @param destination_type is the destination name, resolved with DestinationType.from_string
    @param expected is the expected result with in_jinja=False
    @param expected_in_jinja is the expected result with in_jinja=True
    """
    destination = DestinationType.from_string(destination_type)

    outside_jinja = DestinationNameTransformer(destination).normalize_column_name(input_str, in_jinja=False)
    assert outside_jinja == expected

    inside_jinja = DestinationNameTransformer(destination).normalize_column_name(input_str, in_jinja=True)
    assert inside_jinja == expected_in_jinja
def get_nested_hashed_table_name(name_transformer: DestinationNameTransformer, schema: str, json_path: List[str], child: str) -> str:
    """
    Build the name of a nested table as "<parent>_<hash>_<child>".

    The parent is every element of json_path except the last, joined with underscores. The hash is
    computed by hash_json_path over the schema followed by the json path. When the full name does
    not fit the maximum length, the parent is shortened first, then the child is truncated as well.

    @param name_transformer provides the maximum identifier length, name normalization and truncation
    @param schema is prepended to json_path before hashing
    @param json_path is the list of names leading to the nested table
    @param child is the name of the nested table itself
    @return the combined table name
    @raise RuntimeError when json_path yields no parent
    """
    separators_length = 2  # the two underscores around the hash
    parent = "_".join(json_path[:-1])
    limit = name_transformer.get_name_max_length()
    path_hash = hash_json_path([schema] + json_path)
    if parent:
        head = name_transformer.normalize_table_name(parent, False, False)
    else:
        head = parent
    body = name_transformer.normalize_table_name(child, False, False)
    head_floor = min(MINIMUM_PARENT_LENGTH, len(head))

    # no parent
    if not parent:
        raise RuntimeError("There is no nested table names without parents")

    # everything fits: nothing is truncated
    if (len(head) + len(path_hash) + len(body) + separators_length) < limit:
        return f"{head}_{path_hash}_{body}"

    # everything but the parent fits: cut only the parent (it keeps at least head_floor characters)
    if (head_floor + len(path_hash) + len(body) + separators_length) < limit:
        head_budget = limit - len(path_hash) - len(body) - separators_length
        return f"{head[:head_budget]}_{path_hash}_{body}"

    # otherwise cut the parent to its minimum length and truncate the child too
    body_budget = limit - len(path_hash) - separators_length - head_floor
    short_body = name_transformer.truncate_identifier_name(body, body_budget)
    return f"{head[:head_floor]}_{path_hash}_{short_body}"
def build_stream_processor(
    catalog: Dict,
    json_column_name: str,
    default_schema: str,
    name_transformer: DestinationNameTransformer,
    destination_type: DestinationType,
    tables_registry: TableNameRegistry,
) -> List[StreamProcessor]:
    """
    Create one StreamProcessor per stream listed under "streams" in the catalog.

    @param catalog is the catalog dictionary; must contain "streams"
    @param json_column_name is wrapped in single quotes and handed to each processor
    @param default_schema is the schema used for streams that do not define a "namespace"
    @param name_transformer is used to normalize schema and table names
    @param destination_type is forwarded to each StreamProcessor
    @param tables_registry is forwarded to each StreamProcessor
    @return the list of created stream processors, in catalog order
    """
    processors = []
    configured_streams = get_field(catalog, "streams", "Invalid Catalog: 'streams' is not defined in Catalog")
    for configured_stream in configured_streams:
        stream_config = get_field(configured_stream, "stream", "Invalid Stream: 'stream' is not defined in Catalog streams")

        # The logic here matches the logic in JdbcBufferedConsumerFactory.java.
        # Any modifications need to be reflected there and vice versa.
        schema = stream_config["namespace"] if "namespace" in stream_config else default_schema

        schema_name = name_transformer.normalize_schema_name(schema, truncate=False)
        raw_schema_name = name_transformer.normalize_schema_name(f"_airbyte_{schema}", truncate=False)
        stream_name = get_field(stream_config, "name", f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}")
        raw_table_name = name_transformer.normalize_table_name(f"_airbyte_raw_{stream_name}", truncate=False)

        source_sync_mode = get_source_sync_mode(configured_stream, stream_name)
        destination_sync_mode = get_destination_sync_mode(configured_stream, stream_name)

        is_incremental = source_sync_mode.value == SyncMode.incremental.value
        is_dedup = destination_sync_mode.value in [
            # DestinationSyncMode.upsert_dedup.value,
            DestinationSyncMode.append_dedup.value,
        ]

        # a cursor is required for incremental sources and for deduplicating destinations
        cursor_field = []
        if is_incremental or is_dedup:
            cursor_field = get_field(configured_stream, "cursor_field", f"Undefined cursor field for stream {stream_name}")

        # a primary key is required only for deduplicating destinations
        primary_key = []
        if is_dedup:
            primary_key = get_field(configured_stream, "primary_key", f"Undefined primary key for stream {stream_name}")

        message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
        json_schema = get_field(stream_config, "json_schema", message)
        properties = get_field(json_schema, "properties", message)

        from_table = "source('{}', '{}')".format(schema_name, raw_table_name)

        processors.append(
            StreamProcessor.create(
                stream_name=stream_name,
                destination_type=destination_type,
                raw_schema=raw_schema_name,
                schema=schema_name,
                source_sync_mode=source_sync_mode,
                destination_sync_mode=destination_sync_mode,
                cursor_field=cursor_field,
                primary_key=primary_key,
                json_column_name=f"'{json_column_name}'",
                properties=properties,
                tables_registry=tables_registry,
                from_table=from_table,
            )
        )
    return processors
def test_normalize_name(input_str: str, destination_type: str, expected: str, expected_column: str):
    """
    Check schema, table and column name normalization for one input.

    @param input_str is the name to normalize
    @param destination_type is the destination name, resolved with DestinationType.from_string
    @param expected is the expected schema name and table name
    @param expected_column is the expected column name
    """
    destination = DestinationType.from_string(destination_type)

    schema_name = DestinationNameTransformer(destination).normalize_schema_name(input_str)
    assert schema_name == expected

    table_name = DestinationNameTransformer(destination).normalize_table_name(input_str)
    assert table_name == expected

    column_name = DestinationNameTransformer(destination).normalize_column_name(input_str)
    assert column_name == expected_column
def build_stream_processor(
    catalog: Dict,
    json_column_name: str,
    target_schema: str,
    name_transformer: DestinationNameTransformer,
    destination_type: DestinationType,
    tables_registry: Set[str],
) -> List[StreamProcessor]:
    """
    Create one StreamProcessor per stream listed under "streams" in the catalog.

    @param catalog is the catalog dictionary; must contain "streams"
    @param json_column_name is wrapped in single quotes and handed to each processor
    @param target_schema is the schema every stream is written to
    @param name_transformer is used to normalize schema and table names
    @param destination_type is forwarded to each StreamProcessor
    @param tables_registry is forwarded to each StreamProcessor
    @return the list of created stream processors, in catalog order
    @raise EOFError when a stream has empty properties
    """
    processors = []
    configured_streams = get_field(catalog, "streams", "Invalid Catalog: 'streams' is not defined in Catalog")
    for configured_stream in configured_streams:
        stream_config = get_field(configured_stream, "stream", "Invalid Stream: 'stream' is not defined in Catalog streams")

        schema_name = name_transformer.normalize_schema_name(target_schema)
        raw_schema_name = name_transformer.normalize_schema_name(f"_airbyte_{target_schema}", truncate=False)
        stream_name = get_field(stream_config, "name", f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}")
        raw_table_name = name_transformer.normalize_table_name(f"_airbyte_raw_{stream_name}", truncate=False)

        message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
        json_schema = get_field(stream_config, "json_schema", message)
        properties = get_field(json_schema, "properties", message)

        from_table = "source('{}', '{}')".format(schema_name, raw_table_name)

        # Check properties
        if not properties:
            raise EOFError("Invalid Catalog: Unexpected empty properties in catalog")

        processors.append(
            StreamProcessor.create(
                stream_name=stream_name,
                destination_type=destination_type,
                raw_schema=raw_schema_name,
                schema=schema_name,
                json_column_name=f"'{json_column_name}'",
                properties=properties,
                tables_registry=tables_registry,
                from_table=from_table,
            )
        )
    return processors
def test_truncate_identifier(input_str: str, expected: str):
    """
    Verify how truncate_identifier_name shortens (in the middle) names that are too long for Postgres.

    For example, both of these strings are too long:
    - `Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii`
    - `Aaaa_Bbbb_Cccc_Dddd_Eeee_a_very_long_name_Ffff_Gggg_Hhhh_Iiii`

    and both end up as `Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii`, which can potentially cause a
    collision in table names. Dealing with such collisions is not the job of
    `destination_name_transformer` but of the `stream_processor`.

    @param input_str is the identifier to truncate
    @param expected is the expected truncated identifier
    """
    postgres_transformer = DestinationNameTransformer(DestinationType.POSTGRES)
    print(f"Truncating from #{len(input_str)} to #{len(expected)}")
    truncated = postgres_transformer.truncate_identifier_name(input_str)
    assert truncated == expected
def __init__(
    self,
    stream_name: str,
    destination_type: DestinationType,
    raw_schema: str,
    schema: str,
    json_column_name: str,
    properties: Dict,
    tables_registry: Set[str],
    from_table: str,
):
    """
    Store the constructor arguments and initialize the processing state.

    See StreamProcessor.create()
    """
    # values received from the caller
    self.stream_name: str = stream_name
    self.destination_type: DestinationType = destination_type
    self.raw_schema: str = raw_schema
    self.schema: str = schema
    self.json_column_name: str = json_column_name
    self.properties: Dict = properties
    self.tables_registry: Set[str] = tables_registry
    self.from_table: str = from_table

    # name transformer dedicated to this destination type
    self.name_transformer: DestinationNameTransformer = DestinationNameTransformer(destination_type)

    # initial state: the json path starts at the stream itself, nothing is generated yet
    self.json_path: List[str] = [stream_name]
    self.final_table_name: str = ""
    self.sql_outputs: Dict[str, str] = {}
    self.local_registry: Set[str] = set()
    self.parent: Optional["StreamProcessor"] = None
    self.is_nested_array: bool = False
def test_get_nested_hashed_table_name(json_path: List[str], expected_postgres: str, expected_bigquery: str):
    """
    Check the name generated by get_nested_hashed_table_name for Postgres and BigQuery.

    The generated name combines several fields into a single table name so the user can (somehow)
    identify and recognize what data is available in there. A set of rules decides which parts are
    truncated and which are left untouched.

    @param json_path is the path to the nested table; its last element is used as the child name
    @param expected_postgres is the expected name for the Postgres destination
    @param expected_bigquery is the expected name for the BigQuery destination
    """
    child = json_path[-1]

    postgres_name = get_nested_hashed_table_name(
        DestinationNameTransformer(DestinationType.POSTGRES), "schema", json_path, child
    )
    assert postgres_name == expected_postgres
    # explicitly check for our max postgres length in case tests are changed in the future
    assert len(postgres_name) <= 43

    bigquery_name = get_nested_hashed_table_name(
        DestinationNameTransformer(DestinationType.BIGQUERY), "schema", json_path, child
    )
    assert bigquery_name == expected_bigquery
def __init__(self, output_directory: str, destination_type: DestinationType):
    """
    Remember where to write and for which destination, and create the matching name transformer.

    @param output_directory is the path to the directory where this processor should write the resulting SQL files (DBT models)
    @param destination_type is the destination type of warehouse
    """
    transformer = DestinationNameTransformer(destination_type)
    self.output_directory: str = output_directory
    self.destination_type: DestinationType = destination_type
    self.name_transformer: DestinationNameTransformer = transformer
def test_resolve_names(destination_type: DestinationType, catalog_file: str):
    """
    Check the table names resolved for a given catalog.json and destination.

    Naming can become tricky (especially since some destinations like postgres have a very low limit
    on identifier length): with nested objects/arrays in a stream, names can become very long.

    Tests are built using resource files as follows:
    - `<name of source or test types>_catalog.json`:
        input catalog.json, typically what a source would provide.
        For example Hubspot, Stripe and Facebook catalog.json contain some level of nesting.
        (here, nested_catalog.json is an extracted smaller sample of stream/properties from the facebook catalog)
    - `<name of source or test types>_expected_names.json`:
        list of expected table names

    The expected json file can be specialized for a destination. If the resources folder contains:
    - edge_cases_catalog_expected_names.json
    - edge_cases_catalog_expected_postgres_names.json
    then edge_cases_catalog_expected_names.json is used, except for the Postgres destination where the
    expected table names come from edge_cases_catalog_expected_postgres_names.json

    The content of the expected_*.json files is the serialization of the
    stream_processor.tables_registry.registry
    """
    integration_type = destination_type.value
    tables_registry = TableNameRegistry(destination_type)

    catalog = read_json(f"resources/{catalog_file}.json")

    # process top level
    stream_processors = CatalogProcessor.build_stream_processor(
        catalog=catalog,
        json_column_name="'json_column_name_test'",
        default_schema="schema_test",
        name_transformer=DestinationNameTransformer(destination_type),
        destination_type=destination_type,
        tables_registry=tables_registry,
    )
    for stream_processor in stream_processors:
        # Check properties
        if not stream_processor.properties:
            raise EOFError("Invalid Catalog: Unexpected empty properties in catalog")
        stream_processor.collect_table_names()

    for conflict in tables_registry.resolve_names():
        print(
            f"WARN: Resolving conflict: {conflict.schema}.{conflict.table_name_conflict} "
            f"from '{'.'.join(conflict.json_path)}' into {conflict.table_name_resolved}"
        )

    # expected names are upper-cased for Snowflake, lower-cased for Redshift and left as-is otherwise
    if DestinationType.SNOWFLAKE.value == destination_type.value:
        apply_function = str.upper
    elif DestinationType.REDSHIFT.value == destination_type.value:
        apply_function = str.lower
    else:
        apply_function = identity

    # a destination-specific expectation file takes precedence over the generic one
    specific_path = f"resources/{catalog_file}_expected_{integration_type.lower()}_names.json"
    if os.path.exists(specific_path):
        expected_names = read_json(specific_path, apply_function)
    else:
        expected_names = read_json(f"resources/{catalog_file}_expected_names.json", apply_function)

    assert tables_registry.to_dict(apply_function) == expected_names
def test_get_table_name(root_table: str, base_table_name: str, suffix: str, expected: str):
    """
    Check the name generated by get_table_name for the Postgres destination.

    @param root_table is passed as the parent table name
    @param base_table_name is passed as the child table name
    @param suffix is passed as the suffix
    @param expected is the expected table name
    """
    postgres_transformer = DestinationNameTransformer(DestinationType.POSTGRES)
    json_path = ["json", "path"]
    actual = get_table_name(postgres_transformer, root_table, base_table_name, suffix, json_path)
    assert actual == expected
    # explicitly check for our max postgres length in case tests are changed in the future
    assert len(actual) <= 43
def get_table_name(name_transformer: DestinationNameTransformer, parent: str, child: str, suffix: str, json_path: List[str]) -> str:
    """
    Build a table name out of the pieces of information used across the normalization code base:

    - parent table: denotes where a table is extracted from (in case of nesting)
    - child table: in case of nesting, the field name or the original stream name
    - extra suffix: normalization is done in multiple transformation steps, each may need to generate
        separate tables, so a suffix can be added to distinguish the different steps of a pipeline.
    - json path: the parent and nested field names leading to the table currently being built

    All of these should be included (if possible) in the table name so the user can (somehow)
    identify and recognize what data is available there.

    @return "<parent>_<hash of json path>_<child><suffix>", with the parent and then the child
            shortened when needed; without a parent, the truncated "<child><suffix>"
    """
    # less two for the underscores
    available = name_transformer.get_name_max_length() - 2
    hashed_path = hash_json_path(json_path)

    suffix_part = suffix
    if suffix and not suffix.startswith("_"):
        suffix_part = f"_{suffix}"

    parent_part = parent
    if parent:
        parent_part = name_transformer.normalize_table_name(parent, False, False)

    child_part = name_transformer.normalize_table_name(child, False, False)
    parent_minimum = min(MINIMUM_PARENT_LENGTH, len(parent_part))

    # no parent
    if not parent:
        return name_transformer.truncate_identifier_name(f"{child_part}{suffix_part}")

    length_without_parent = len(child_part) + len(hashed_path) + len(suffix_part)

    # if everything fits without truncation, don't truncate anything
    if (len(parent_part) + length_without_parent) < available:
        return f"{parent_part}_{hashed_path}_{child_part}{suffix_part}"

    # if everything fits except for the parent, just truncate the parent
    if length_without_parent < (available - parent_minimum):
        parent_room = available - length_without_parent
        return f"{parent_part[:parent_room]}_{hashed_path}_{child_part}{suffix_part}"

    # otherwise first truncate parent to the minimum length and middle truncate the child
    child_room = available - parent_minimum - len(hashed_path) - len(suffix_part)
    truncated_child = name_transformer.truncate_identifier_name(child_part, child_room)
    return f"{parent_part[:parent_minimum]}_{hashed_path}_{truncated_child}{suffix_part}"
def __init__(self, destination_type: DestinationType):
    """
    Create an empty registry for the given destination.

    @param destination_type is the destination type of warehouse
    """
    transformer = DestinationNameTransformer(destination_type)
    self.destination_type: DestinationType = destination_type
    self.name_transformer: DestinationNameTransformer = transformer

    # Simple XXX registries collect "simple" XXX names (with potential collisions)
    self.simple_file_registry: NormalizedFilesRegistry = NormalizedFilesRegistry()
    self.simple_table_registry: NormalizedTablesRegistry = NormalizedTablesRegistry(transformer)

    # Registry is the collision free (resolved) mapping of schema json_path of the stream to the names that should be used
    self.registry: Dict[str, ResolvedNameMetadata] = {}
def test_get_table_name(root_table: str, base_table_name: str, suffix: str, expected: str):
    """
    Check how get_table_name combines its inputs into a single table name for Postgres.

    - parent table: referred to as root table
    - child table: referred to as base table.
    - extra suffix: normalization steps used as suffix to decompose pipeline into multi steps.
    - json path: the parent and nested field names leading to the table currently being built

    See the documentation of the get_table_name method for more details. This test checks the
    strategies for combining all those fields into a single table name so the user can (somehow)
    identify and recognize what data is available in there. A set of rules decides which parts are
    truncated and which are left untouched.
    """
    postgres_transformer = DestinationNameTransformer(DestinationType.POSTGRES)
    json_path = ["json", "path"]
    actual = get_table_name(postgres_transformer, root_table, base_table_name, suffix, json_path)
    assert actual == expected
    # explicitly check for our max postgres length in case tests are changed in the future
    assert len(actual) <= 43
def __init__(
    self,
    stream_name: str,
    destination_type: DestinationType,
    raw_schema: str,
    schema: str,
    source_sync_mode: SyncMode,
    destination_sync_mode: DestinationSyncMode,
    cursor_field: List[str],
    primary_key: List[List[str]],
    json_column_name: str,
    properties: Dict,
    tables_registry: Dict[str, Dict[str, str]],
    from_table: str,
):
    """
    Store the constructor arguments and initialize the processing state.

    See StreamProcessor.create()
    """
    # identity and location of the stream
    self.stream_name: str = stream_name
    self.destination_type: DestinationType = destination_type
    self.raw_schema: str = raw_schema
    self.schema: str = schema

    # synchronization settings
    self.source_sync_mode: SyncMode = source_sync_mode
    self.destination_sync_mode: DestinationSyncMode = destination_sync_mode
    self.cursor_field: List[str] = cursor_field
    self.primary_key: List[List[str]] = primary_key

    # data description and shared registry
    self.json_column_name: str = json_column_name
    self.properties: Dict = properties
    self.tables_registry: Dict[str, Dict[str, str]] = tables_registry
    self.from_table: str = from_table

    # name transformer dedicated to this destination type
    self.name_transformer: DestinationNameTransformer = DestinationNameTransformer(destination_type)

    # initial state: the json path starts at the stream itself, nothing is generated yet
    self.json_path: List[str] = [stream_name]
    self.final_table_name: str = ""
    self.sql_outputs: Dict[str, str] = {}
    self.local_registry: Dict[str, Dict[str, str]] = {}
    self.parent: Optional["StreamProcessor"] = None
    self.is_nested_array: bool = False
def test_needs_quote(input_str: str, destination_type: str, expected: bool):
    """
    Check the result of needs_quotes for one identifier and destination.

    @param input_str is the identifier to check
    @param destination_type is the destination name, resolved with DestinationType.from_string
    @param expected is the expected result of needs_quotes
    """
    destination = DestinationType.from_string(destination_type)
    transformer = DestinationNameTransformer(destination)
    actual = transformer.needs_quotes(input_str)
    assert actual == expected
def build_stream_processor(
    catalog: Dict,
    json_column_name: str,
    default_schema: str,
    name_transformer: DestinationNameTransformer,
    destination_type: DestinationType,
    tables_registry: TableNameRegistry,
) -> List[StreamProcessor]:
    """
    Create one StreamProcessor per stream listed under "streams" in the catalog.

    @param catalog is the catalog dictionary; must contain "streams"
    @param json_column_name is the json column name; it is normalized (Oracle) or single-quoted
           (other destinations) unless it already has the expected form
    @param default_schema is the schema used for streams that do not define a "namespace"
    @param name_transformer is used to normalize schema, table and column names
    @param destination_type is forwarded to each StreamProcessor and selects destination specific handling
    @param tables_registry is forwarded to each StreamProcessor
    @return the list of created stream processors, in catalog order
    """
    processors = []
    configured_streams = get_field(catalog, "streams", "Invalid Catalog: 'streams' is not defined in Catalog")
    for configured_stream in configured_streams:
        stream_config = get_field(configured_stream, "stream", "Invalid Stream: 'stream' is not defined in Catalog streams")

        # The logic here matches the logic in JdbcBufferedConsumerFactory.java.
        # Any modifications need to be reflected there and vice versa.
        schema = stream_config["namespace"] if "namespace" in stream_config else default_schema

        schema_name = name_transformer.normalize_schema_name(schema, truncate=False)
        if destination_type == DestinationType.ORACLE:
            quote_in_parenthesis = re.compile(r"quote\((.*)\)")
            raw_schema_name = name_transformer.normalize_schema_name(schema, truncate=False)
            # json_column_name is reassigned here, so the check prevents normalizing it again for later streams
            if quote_in_parenthesis.search(json_column_name) is None:
                json_column_name = name_transformer.normalize_column_name(json_column_name, in_jinja=True)
        else:
            column_inside_single_quote = re.compile(r"\'(.*)\'")
            raw_schema_name = name_transformer.normalize_schema_name(f"_airbyte_{schema}", truncate=False)
            # json_column_name is reassigned here, so the check prevents quoting it again for later streams
            if column_inside_single_quote.search(json_column_name) is None:
                json_column_name = f"'{json_column_name}'"

        stream_name = get_field(stream_config, "name", f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}")

        # MySQL table names need to be manually truncated, because it does not do it automatically
        truncate = destination_type == DestinationType.MYSQL
        raw_table_name = name_transformer.normalize_table_name(f"_airbyte_raw_{stream_name}", truncate=truncate)

        source_sync_mode = get_source_sync_mode(configured_stream, stream_name)
        destination_sync_mode = get_destination_sync_mode(configured_stream, stream_name)

        is_incremental = source_sync_mode.value == SyncMode.incremental.value
        is_dedup = destination_sync_mode.value in [
            # DestinationSyncMode.upsert_dedup.value,
            DestinationSyncMode.append_dedup.value,
        ]

        # a cursor is required for incremental sources and for deduplicating destinations
        cursor_field = []
        if is_incremental or is_dedup:
            cursor_field = get_field(configured_stream, "cursor_field", f"Undefined cursor field for stream {stream_name}")

        # a primary key is required only for deduplicating destinations
        primary_key = []
        if is_dedup:
            primary_key = get_field(configured_stream, "primary_key", f"Undefined primary key for stream {stream_name}")

        message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
        json_schema = get_field(stream_config, "json_schema", message)
        properties = get_field(json_schema, "properties", message)

        from_table = dbt_macro.Source(schema_name, raw_table_name)

        processors.append(
            StreamProcessor.create(
                stream_name=stream_name,
                destination_type=destination_type,
                raw_schema=raw_schema_name,
                default_schema=default_schema,
                schema=schema_name,
                source_sync_mode=source_sync_mode,
                destination_sync_mode=destination_sync_mode,
                cursor_field=cursor_field,
                primary_key=primary_key,
                json_column_name=json_column_name,
                properties=properties,
                tables_registry=tables_registry,
                from_table=from_table,
            )
        )
    return processors
def build_stream_processor(
    catalog: Dict,
    json_column_name: str,
    target_schema: str,
    name_transformer: DestinationNameTransformer,
    destination_type: DestinationType,
    tables_registry: Set[str],
) -> List[StreamProcessor]:
    """
    Create one StreamProcessor per stream listed under "streams" in the catalog.

    @param catalog is the catalog dictionary; must contain "streams"
    @param json_column_name is wrapped in single quotes and handed to each processor
    @param target_schema is the schema every stream is written to
    @param name_transformer is used to normalize schema and table names
    @param destination_type is forwarded to each StreamProcessor
    @param tables_registry is forwarded to each StreamProcessor
    @return the list of created stream processors, in catalog order
    @raise EOFError when a stream has empty properties
    """
    processors = []
    configured_streams = get_field(catalog, "streams", "Invalid Catalog: 'streams' is not defined in Catalog")
    for configured_stream in configured_streams:
        stream_config = get_field(configured_stream, "stream", "Invalid Stream: 'stream' is not defined in Catalog streams")

        schema_name = name_transformer.normalize_schema_name(target_schema)
        raw_schema_name = name_transformer.normalize_schema_name(f"_airbyte_{target_schema}", truncate=False)
        stream_name = get_field(stream_config, "name", f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}")
        raw_table_name = name_transformer.normalize_table_name(f"_airbyte_raw_{stream_name}", truncate=False)

        source_sync_mode = get_source_sync_mode(configured_stream, stream_name)
        destination_sync_mode = get_destination_sync_mode(configured_stream, stream_name)

        is_incremental = source_sync_mode.value == SyncMode.incremental.value
        is_dedup = destination_sync_mode.value in [
            # DestinationSyncMode.upsert_dedup.value,
            DestinationSyncMode.append_dedup.value,
        ]

        # a cursor is required for incremental sources and for deduplicating destinations
        cursor_field = []
        if is_incremental or is_dedup:
            cursor_field = get_field(configured_stream, "cursor_field", f"Undefined cursor field for stream {stream_name}")

        # a primary key is required only for deduplicating destinations
        primary_key = []
        if is_dedup:
            primary_key = get_field(configured_stream, "primary_key", f"Undefined primary key for stream {stream_name}")

        message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
        json_schema = get_field(stream_config, "json_schema", message)
        properties = get_field(json_schema, "properties", message)

        from_table = "source('{}', '{}')".format(schema_name, raw_table_name)

        # Check properties
        if not properties:
            raise EOFError("Invalid Catalog: Unexpected empty properties in catalog")

        processors.append(
            StreamProcessor.create(
                stream_name=stream_name,
                destination_type=destination_type,
                raw_schema=raw_schema_name,
                schema=schema_name,
                source_sync_mode=source_sync_mode,
                destination_sync_mode=destination_sync_mode,
                cursor_field=cursor_field,
                primary_key=primary_key,
                json_column_name=f"'{json_column_name}'",
                properties=properties,
                tables_registry=tables_registry,
                from_table=from_table,
            )
        )
    return processors
def test_stream_processor_tables_naming(integration_type: str, catalog_file: str):
    """
    Check the table names generated for a given catalog.json and destination.

    Naming can become tricky (especially since some destinations like postgres have a very low limit
    on identifier length): with nested objects/arrays in a stream, names can become very long.

    Tests are built using resource files as follows:
    - `<name of source or test types>_catalog.json`:
        input catalog.json, typically what a source would provide.
        For example Hubspot, Stripe and Facebook catalog.json contain some level of nesting.
        (here, nested_catalog.json is an extracted smaller sample of stream/properties from the facebook catalog)
    - `<name of source or test types>_expected_top_level.json`:
        list of expected table names for the top level stream names
    - `<name of source or test types>_expected_nested.json`:
        list of expected table names for nested objects, extracted to their own and separate table names

    The expected json files can be specialized for a destination. If the resources folder contains:
    - edge_cases_catalog_expected_top_level.json
    - edge_cases_catalog_expected_top_level_postgres.json
    then edge_cases_catalog_expected_top_level.json is used, except for the Postgres destination where
    the expected table names come from edge_cases_catalog_expected_top_level_postgres.json

    The content of the expected_*.json files is the serialization of the stream_processor.tables_registry
    (mapping per schema to all tables in that schema, mapping to the final filename)
    """
    destination_type = DestinationType.from_string(integration_type)
    tables_registry = {}
    pending = []

    catalog = read_json(f"resources/{catalog_file}.json")

    # process top level
    top_level_processors = CatalogProcessor.build_stream_processor(
        catalog=catalog,
        json_column_name="'json_column_name_test'",
        default_schema="schema_test",
        name_transformer=DestinationNameTransformer(destination_type),
        destination_type=destination_type,
        tables_registry=tables_registry,
    )
    for stream_processor in top_level_processors:
        nested_processors = stream_processor.process()
        tables_registry = add_table_to_registry(tables_registry, stream_processor)
        if nested_processors:
            pending.extend(nested_processors)

    # expected names are upper-cased for Snowflake, lower-cased for Redshift and left as-is otherwise
    if DestinationType.SNOWFLAKE.value == destination_type.value:
        apply_function = str.upper
    elif DestinationType.REDSHIFT.value == destination_type.value:
        apply_function = str.lower
    else:
        apply_function = None

    def read_expected(specific_path: str, generic_path: str):
        # a destination-specific expectation file takes precedence over the generic one
        chosen_path = specific_path if os.path.exists(specific_path) else generic_path
        return read_json(chosen_path, apply_function)

    expected_top_level = read_expected(
        f"resources/{catalog_file}_expected_top_level_{integration_type.lower()}.json",
        f"resources/{catalog_file}_expected_top_level.json",
    )
    assert tables_registry == expected_top_level

    # process substreams, one nesting level at a time
    while pending:
        current_level, pending = pending, []
        for substream in current_level:
            substream.tables_registry = tables_registry
            nested_processors = substream.process()
            tables_registry = add_table_to_registry(tables_registry, substream)
            if nested_processors:
                pending.extend(nested_processors)

    expected_nested = read_expected(
        f"resources/{catalog_file}_expected_nested_{integration_type.lower()}.json",
        f"resources/{catalog_file}_expected_nested.json",
    )

    # remove expected top level tables from tables_registry
    for schema, top_level_tables in expected_top_level.items():
        for table in top_level_tables:
            del tables_registry[schema][table]
        if not tables_registry[schema]:
            del tables_registry[schema]
    assert tables_registry == expected_nested
def test_stream_processor_tables_naming(integration_type: str, catalog_file: str, setup_test_path):
    """
    Check the table names registered by the stream processors built from a catalog.

    The catalog is read from resources/<catalog_file>.json. Top-level streams are processed first:
    every table in a processor's local registry must have a matching "<table>.sql" entry in its
    sql_outputs, and the accumulated registry must equal the expected top-level tables. Nested
    sub-streams are then processed level by level, and the tables added on top of the top-level
    ones must equal the expected nested tables.

    Expected tables are read from resources/<catalog_file>_expected_<level>_<integration_type>.json
    when that destination-specific file exists, otherwise from
    resources/<catalog_file>_expected_<level>.json. They are upper-cased for Snowflake and
    lower-cased for Redshift before being compared.

    :param integration_type: destination name, parsed with DestinationType.from_string
    :param catalog_file: base name (without extension) of the catalog file in resources/
    :param setup_test_path: not referenced in the body; presumably a fixture preparing the
        working directory so the relative resources/ paths resolve (TODO confirm)
    """
    destination_type = DestinationType.from_string(integration_type)

    def load_expected_tables(level: str) -> set:
        # A destination-specific expectation file takes precedence over the generic one.
        specific = f"resources/{catalog_file}_expected_{level}_{integration_type.lower()}.json"
        generic = f"resources/{catalog_file}_expected_{level}.json"
        tables = set(read_json(specific if os.path.exists(specific) else generic)["tables"])
        if DestinationType.SNOWFLAKE.value == destination_type.value:
            return {table.upper() for table in tables}
        if DestinationType.REDSHIFT.value == destination_type.value:
            return {table.lower() for table in tables}
        return tables

    tables_registry = set()
    substreams = []
    catalog = read_json(f"resources/{catalog_file}.json")

    # process top level
    for stream_processor in CatalogProcessor.build_stream_processor(
        catalog=catalog,
        json_column_name="'json_column_name_test'",
        target_schema="schema_test",
        name_transformer=DestinationNameTransformer(destination_type),
        destination_type=destination_type,
        tables_registry=tables_registry,
    ):
        nested_processors = stream_processor.process()
        for table in stream_processor.local_registry:
            # Escape the table name and the dot so both are matched literally rather than
            # being interpreted as regular expression syntax.
            sql_file = re.compile(r".*/" + re.escape(table) + r"\.sql")
            assert any(
                sql_file.match(sql_output) for sql_output in stream_processor.sql_outputs
            ), f"no SQL output found for table {table}"
        add_table_to_registry(tables_registry, stream_processor)
        if nested_processors:
            substreams += nested_processors

    expected_top_level = load_expected_tables("top_level")
    assert tables_registry == expected_top_level

    # process substreams, one nesting level per iteration of the while loop
    while substreams:
        children = substreams
        substreams = []
        for substream in children:
            # nested processors share the registry filled in by the previous levels
            substream.tables_registry = tables_registry
            nested_processors = substream.process()
            add_table_to_registry(tables_registry, substream)
            if nested_processors:
                substreams += nested_processors

    expected_nested = load_expected_tables("nested")
    # only the tables added by nested streams are compared against the nested expectations
    assert (tables_registry - expected_top_level) == expected_nested
def test_stream_processor_tables_naming(integration_type: str, catalog_file: str, setup_test_path):
    """
    Check the table names registered by the stream processors built from a catalog.

    The catalog is read from resources/<catalog_file>.json. Top-level streams are processed first:
    for every schema/table pair in a processor's local registry, a "<schema>_<table>.sql" entry
    (truncated by the name transformer when longer than its maximum name length) must be present
    in its sql_outputs. Nested sub-streams are then processed level by level. Finally, all table
    names of the registry, minus the expected top-level tables, must equal the expected nested
    tables.

    Expected tables are read from resources/<catalog_file>_expected_<level>_<integration_type>.json
    when that destination-specific file exists, otherwise from
    resources/<catalog_file>_expected_<level>.json. They are upper-cased for Snowflake and
    lower-cased for Redshift before being compared.

    :param integration_type: destination name, parsed with DestinationType.from_string
    :param catalog_file: base name (without extension) of the catalog file in resources/
    :param setup_test_path: not referenced in the body; presumably a fixture preparing the
        working directory so the relative resources/ paths resolve (TODO confirm)
    """
    destination_type = DestinationType.from_string(integration_type)

    def load_expected_tables(level: str) -> set:
        # A destination-specific expectation file takes precedence over the generic one.
        specific = f"resources/{catalog_file}_expected_{level}_{integration_type.lower()}.json"
        generic = f"resources/{catalog_file}_expected_{level}.json"
        tables = set(read_json(specific if os.path.exists(specific) else generic)["tables"])
        if DestinationType.SNOWFLAKE.value == destination_type.value:
            return {table.upper() for table in tables}
        if DestinationType.REDSHIFT.value == destination_type.value:
            return {table.lower() for table in tables}
        return tables

    tables_registry = {}
    substreams = []
    catalog = read_json(f"resources/{catalog_file}.json")

    # process top level
    for stream_processor in CatalogProcessor.build_stream_processor(
        catalog=catalog,
        json_column_name="'json_column_name_test'",
        default_schema="schema_test",
        name_transformer=DestinationNameTransformer(destination_type),
        destination_type=destination_type,
        tables_registry=tables_registry,
    ):
        nested_processors = stream_processor.process()
        name_transformer = stream_processor.name_transformer
        for schema in stream_processor.local_registry:
            for table in stream_processor.local_registry[schema]:
                # The expected file name does not depend on the SQL output being inspected,
                # so it is computed once per table.
                file_name = f"{schema}_{table}"
                if len(file_name) > name_transformer.get_name_max_length():
                    file_name = name_transformer.truncate_identifier_name(input_name=file_name)
                # Escape the file name and the dot so both are matched literally rather than
                # being interpreted as regular expression syntax.
                sql_file = re.compile(r".*/" + re.escape(file_name) + r"\.sql")
                assert any(
                    sql_file.match(sql_output) for sql_output in stream_processor.sql_outputs
                ), f"no SQL output found for table {schema}.{table}"
        add_table_to_registry(tables_registry, stream_processor)
        if nested_processors:
            substreams += nested_processors

    expected_top_level = load_expected_tables("top_level")

    # process substreams, one nesting level per iteration of the while loop
    while substreams:
        children = substreams
        substreams = []
        for substream in children:
            # nested processors share the registry filled in by the previous levels
            substream.tables_registry = tables_registry
            nested_processors = substream.process()
            add_table_to_registry(tables_registry, substream)
            if nested_processors:
                substreams += nested_processors

    expected_nested = load_expected_tables("nested")

    # TODO(davin): Instead of unwrapping all tables, rewrite this test so tables are compared based on schema.
    all_tables = {table for schema in tables_registry for table in tables_registry[schema]}
    assert (all_tables - expected_top_level) == expected_nested
def test_truncate_identifier(input_str: str, expected: str):
    """
    Check that the Postgres name transformer truncates `input_str` to `expected`.

    The lengths of the input and of the expected result are printed before the comparison.
    """
    transformer = DestinationNameTransformer(DestinationType.POSTGRES)
    print(f"Truncating from #{len(input_str)} to #{len(expected)}")
    truncated = transformer.truncate_identifier_name(input_str)
    assert truncated == expected