def test_normalize_column_name(input_str: str, destination_type: str, expected: str, expected_in_jinja: str):
    t = DestinationType.from_string(destination_type)
    assert DestinationNameTransformer(t).normalize_column_name(input_str, in_jinja=False) == expected
    assert DestinationNameTransformer(t).normalize_column_name(input_str, in_jinja=True) == expected_in_jinja
def test_normalize_name(input_str: str, destination_type: str, expected: str, expected_column: str):
    t = DestinationType.from_string(destination_type)
    assert DestinationNameTransformer(t).normalize_schema_name(input_str) == expected
    assert DestinationNameTransformer(t).normalize_table_name(input_str) == expected
    assert DestinationNameTransformer(t).normalize_column_name(input_str) == expected_column
def process_catalog(self) -> None:
    destination_type = DestinationType.from_string(self.config["integration_type"])
    schema = self.config["schema"]
    output = self.config["output_path"]
    json_col = self.config["json_column"]
    processor = CatalogProcessor(output_directory=output, destination_type=destination_type)
    for catalog_file in self.config["catalog"]:
        print(f"Processing {catalog_file}...")
        processor.process(catalog_file=catalog_file, json_column_name=json_col, default_schema=schema)
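# Illustrative only: a config mapping of the shape process_catalog() reads above.
# The keys match the lookups in the method; the values are hypothetical and not
# taken from any real connector configuration.
example_config = {
    "integration_type": "postgres",           # parsed with DestinationType.from_string
    "schema": "public",                       # default schema passed to processor.process
    "output_path": "normalization_output",    # output directory handed to CatalogProcessor
    "json_column": "_airbyte_data",           # column holding the raw JSON blob
    "catalog": ["destination_catalog.json"],  # one or more catalog files to process
}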
def test_destination_failure_over_limits(integration_type: str, column_count: int, expected_exception_message: str, setup_test_path):
    destination_type = DestinationType.from_string(integration_type)
    if destination_type.value not in dbt_test_utils.get_test_targets():
        pytest.skip(f"Destination {destination_type} is not in the NORMALIZATION_TEST_TARGET env variable")
    run_test(destination_type, column_count, expected_exception_message)
def get_test_targets() -> List[str]:
    """
    Returns a list of destinations to run tests on.

    If the NORMALIZATION_TEST_TARGET environment variable is set to a comma separated list of destination names,
    then tests are run only on that subset of destinations.
    Otherwise tests are run against all destinations.
    """
    if os.getenv(NORMALIZATION_TEST_TARGET):
        target_str = os.getenv(NORMALIZATION_TEST_TARGET)
        return [d.value for d in {DestinationType.from_string(s.strip()) for s in target_str.split(",")}]
    else:
        return [d.value for d in DestinationType]
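# A minimal usage sketch for get_test_targets(), assuming the NORMALIZATION_TEST_TARGET
# constant holds the environment variable name "NORMALIZATION_TEST_TARGET" and that
# "postgres" and "snowflake" are valid DestinationType names; both are assumptions made
# for illustration only.
import os

os.environ["NORMALIZATION_TEST_TARGET"] = "postgres, snowflake"
targets = get_test_targets()
# e.g. ["postgres", "snowflake"], depending on the enum's values; order is not
# guaranteed since the result is built from a set.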
def test_normalization(integration_type: str, test_resource_name: str, setup_test_path):
    print("Testing normalization")
    destination_type = DestinationType.from_string(integration_type)
    # Create the test folder with dbt project and appropriate destination settings to run integration tests from
    test_root_dir = setup_test_dir(integration_type, test_resource_name)
    destination_config = generate_profile_yaml_file(destination_type, test_root_dir)
    # Use destination connector to create _airbyte_raw_* tables to use as input for the test
    assert setup_input_raw_data(integration_type, test_resource_name, test_root_dir, destination_config)
    # Normalization step
    generate_dbt_models(destination_type, test_resource_name, test_root_dir)
    dbt_run(test_root_dir)
    # Run checks on test results
    dbt_test(destination_type, test_resource_name, test_root_dir)
    check_outputs(destination_type, test_resource_name, test_root_dir)
def test_needs_quote(input_str: str, destination_type: str, expected: bool):
    name_transformer = DestinationNameTransformer(DestinationType.from_string(destination_type))
    assert name_transformer.needs_quotes(input_str) == expected
def test_stream_processor_tables_naming(integration_type: str, catalog_file: str, setup_test_path):
    destination_type = DestinationType.from_string(integration_type)
    tables_registry = set()
    substreams = []
    catalog = read_json(f"resources/{catalog_file}.json")

    # process top level
    for stream_processor in CatalogProcessor.build_stream_processor(
        catalog=catalog,
        json_column_name="'json_column_name_test'",
        target_schema="schema_test",
        name_transformer=DestinationNameTransformer(destination_type),
        destination_type=destination_type,
        tables_registry=tables_registry,
    ):
        nested_processors = stream_processor.process()
        for table in stream_processor.local_registry:
            found_sql_output = False
            for sql_output in stream_processor.sql_outputs:
                if re.match(r".*/" + table + ".sql", sql_output) is not None:
                    found_sql_output = True
                    break
            assert found_sql_output
        add_table_to_registry(tables_registry, stream_processor)
        if nested_processors and len(nested_processors) > 0:
            substreams += nested_processors

    if os.path.exists(f"resources/{catalog_file}_expected_top_level_{integration_type.lower()}.json"):
        expected_top_level = set(read_json(f"resources/{catalog_file}_expected_top_level_{integration_type.lower()}.json")["tables"])
    else:
        expected_top_level = set(read_json(f"resources/{catalog_file}_expected_top_level.json")["tables"])
    if DestinationType.SNOWFLAKE.value == destination_type.value:
        expected_top_level = {table.upper() for table in expected_top_level}
    elif DestinationType.REDSHIFT.value == destination_type.value:
        expected_top_level = {table.lower() for table in expected_top_level}
    assert tables_registry == expected_top_level

    # process substreams
    while substreams:
        children = substreams
        substreams = []
        for substream in children:
            substream.tables_registry = tables_registry
            nested_processors = substream.process()
            add_table_to_registry(tables_registry, substream)
            if nested_processors:
                substreams += nested_processors

    if os.path.exists(f"resources/{catalog_file}_expected_nested_{integration_type.lower()}.json"):
        expected_nested = set(read_json(f"resources/{catalog_file}_expected_nested_{integration_type.lower()}.json")["tables"])
    else:
        expected_nested = set(read_json(f"resources/{catalog_file}_expected_nested.json")["tables"])
    if DestinationType.SNOWFLAKE.value == destination_type.value:
        expected_nested = {table.upper() for table in expected_nested}
    elif DestinationType.REDSHIFT.value == destination_type.value:
        expected_nested = {table.lower() for table in expected_nested}
    assert (tables_registry - expected_top_level) == expected_nested
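# Hypothetical sketch of an expected_*.json resource as consumed by the set-based
# variants of this test: read_json(...)["tables"] expects a single "tables" key
# listing expected table names. The names below are made up for illustration only.
example_expected_top_level = {
    "tables": [
        "simple_stream",
        "nested_stream_parent",
        "nested_stream_parent_child",
    ]
}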
def test_stream_processor_tables_naming(integration_type: str, catalog_file: str, setup_test_path):
    destination_type = DestinationType.from_string(integration_type)
    tables_registry = {}
    substreams = []
    catalog = read_json(f"resources/{catalog_file}.json")

    # process top level
    for stream_processor in CatalogProcessor.build_stream_processor(
        catalog=catalog,
        json_column_name="'json_column_name_test'",
        default_schema="schema_test",
        name_transformer=DestinationNameTransformer(destination_type),
        destination_type=destination_type,
        tables_registry=tables_registry,
    ):
        nested_processors = stream_processor.process()
        for schema in stream_processor.local_registry:
            for table in stream_processor.local_registry[schema]:
                found_sql_output = False
                for sql_output in stream_processor.sql_outputs:
                    file_name = f"{schema}_{table}"
                    if len(file_name) > stream_processor.name_transformer.get_name_max_length():
                        file_name = stream_processor.name_transformer.truncate_identifier_name(input_name=file_name)
                    if re.match(r".*/" + file_name + ".sql", sql_output) is not None:
                        found_sql_output = True
                        break
                assert found_sql_output
        add_table_to_registry(tables_registry, stream_processor)
        if nested_processors and len(nested_processors) > 0:
            substreams += nested_processors

    if os.path.exists(f"resources/{catalog_file}_expected_top_level_{integration_type.lower()}.json"):
        expected_top_level = set(read_json(f"resources/{catalog_file}_expected_top_level_{integration_type.lower()}.json")["tables"])
    else:
        expected_top_level = set(read_json(f"resources/{catalog_file}_expected_top_level.json")["tables"])
    if DestinationType.SNOWFLAKE.value == destination_type.value:
        expected_top_level = {table.upper() for table in expected_top_level}
    elif DestinationType.REDSHIFT.value == destination_type.value:
        expected_top_level = {table.lower() for table in expected_top_level}

    # process substreams
    while substreams:
        children = substreams
        substreams = []
        for substream in children:
            substream.tables_registry = tables_registry
            nested_processors = substream.process()
            add_table_to_registry(tables_registry, substream)
            if nested_processors:
                substreams += nested_processors

    if os.path.exists(f"resources/{catalog_file}_expected_nested_{integration_type.lower()}.json"):
        expected_nested = set(read_json(f"resources/{catalog_file}_expected_nested_{integration_type.lower()}.json")["tables"])
    else:
        expected_nested = set(read_json(f"resources/{catalog_file}_expected_nested.json")["tables"])
    if DestinationType.SNOWFLAKE.value == destination_type.value:
        expected_nested = {table.upper() for table in expected_nested}
    elif DestinationType.REDSHIFT.value == destination_type.value:
        expected_nested = {table.lower() for table in expected_nested}

    # TODO(davin): Instead of unwrapping all tables, rewrite this test so tables are compared based on schema.
    all_tables = set()
    for schema in tables_registry:
        for tables in tables_registry[schema]:
            all_tables.add(tables)
    assert (all_tables - expected_top_level) == expected_nested
def test_stream_processor_tables_naming(integration_type: str, catalog_file: str):
    """
    For a given catalog.json and destination, multiple cases can occur where naming becomes tricky
    (especially since some destinations, like Postgres, have a very low identifier length limit of 64 characters).
    In case of nested objects/arrays in a stream, names can drag on to very long names.

    Tests are built here using resource files as follows:

    - `<name of source or test types>_catalog.json`:
        input catalog.json, typically what a source would provide.
        For example, the Hubspot, Stripe and Facebook catalog.json contain some level of nesting.
        (here, nested_catalog.json is an extracted smaller sample of stream/properties from the facebook catalog)
    - `<name of source or test types>_expected_top_level.json`:
        list of expected table names for the top level stream names
    - `<name of source or test types>_expected_nested.json`:
        list of expected table names for nested objects, extracted to their own separate table names

    For the expected json files, it is possible to specialize the file to a certain destination.
    So if, for example, the resources folder contains these two expected files:

    - edge_cases_catalog_expected_top_level.json
    - edge_cases_catalog_expected_top_level_postgres.json

    Then the test will use edge_cases_catalog_expected_top_level.json, except for the Postgres destination
    where the expected table names will come from edge_cases_catalog_expected_top_level_postgres.json.

    The content of the expected_*.json files is the serialization of the stream_processor.tables_registry
    (a mapping per schema to all tables in that schema, mapping to the final file name).
    """
    destination_type = DestinationType.from_string(integration_type)
    tables_registry = {}
    substreams = []
    catalog = read_json(f"resources/{catalog_file}.json")

    # process top level
    for stream_processor in CatalogProcessor.build_stream_processor(
        catalog=catalog,
        json_column_name="'json_column_name_test'",
        default_schema="schema_test",
        name_transformer=DestinationNameTransformer(destination_type),
        destination_type=destination_type,
        tables_registry=tables_registry,
    ):
        nested_processors = stream_processor.process()
        tables_registry = add_table_to_registry(tables_registry, stream_processor)
        if nested_processors and len(nested_processors) > 0:
            substreams += nested_processors

    apply_function = None
    if DestinationType.SNOWFLAKE.value == destination_type.value:
        apply_function = str.upper
    elif DestinationType.REDSHIFT.value == destination_type.value:
        apply_function = str.lower
    if os.path.exists(f"resources/{catalog_file}_expected_top_level_{integration_type.lower()}.json"):
        expected_top_level = read_json(f"resources/{catalog_file}_expected_top_level_{integration_type.lower()}.json", apply_function)
    else:
        expected_top_level = read_json(f"resources/{catalog_file}_expected_top_level.json", apply_function)
    assert tables_registry == expected_top_level

    # process substreams
    while substreams:
        children = substreams
        substreams = []
        for substream in children:
            substream.tables_registry = tables_registry
            nested_processors = substream.process()
            tables_registry = add_table_to_registry(tables_registry, substream)
            if nested_processors:
                substreams += nested_processors

    apply_function = None
    if DestinationType.SNOWFLAKE.value == destination_type.value:
        apply_function = str.upper
    elif DestinationType.REDSHIFT.value == destination_type.value:
        apply_function = str.lower
    if os.path.exists(f"resources/{catalog_file}_expected_nested_{integration_type.lower()}.json"):
        expected_nested = read_json(f"resources/{catalog_file}_expected_nested_{integration_type.lower()}.json", apply_function)
    else:
        expected_nested = read_json(f"resources/{catalog_file}_expected_nested.json", apply_function)

    # remove expected top level tables from tables_registry
    for schema in expected_top_level:
        for table in expected_top_level[schema]:
            del tables_registry[schema][table]
        if len(tables_registry[schema]) == 0:
            del tables_registry[schema]
    assert tables_registry == expected_nested
def test_destination_failure_over_limits(integration_type: str, column_count: int, expected_exception_message: str, setup_test_path):
    run_test(DestinationType.from_string(integration_type), column_count, expected_exception_message)