Example #1
def generate_dbt_models(destination_type: DestinationType, test_resource_name: str, test_root_dir: str):
    """
    This is the normalization step, generating dbt model files from the destination_catalog.json taken as input.
    """
    catalog_processor = CatalogProcessor(os.path.join(test_root_dir, "models", "generated"), destination_type)
    catalog_processor.process(
        os.path.join("resources", test_resource_name, "data_input", "catalog.json"), "_airbyte_data", dbt_test_utils.target_schema
    )
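
As a usage illustration only: a hedged sketch of how this helper might be invoked. The destination, resource name and temporary directory below are placeholders, and DestinationType.POSTGRES is assumed to exist on the enum used above.

# Hypothetical invocation of generate_dbt_models (all values are illustrative):
import tempfile

test_root_dir = tempfile.mkdtemp(prefix="normalization_test_")
generate_dbt_models(DestinationType.POSTGRES, "test_simple_streams", test_root_dir)
# dbt model files are then written under <test_root_dir>/models/generated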
Example #2
    def process_catalog(self) -> None:
        destination_type = DestinationType.from_string(self.config["integration_type"])
        schema = self.config["schema"]
        output = self.config["output_path"]
        json_col = self.config["json_column"]
        processor = CatalogProcessor(output_directory=output, destination_type=destination_type)
        for catalog_file in self.config["catalog"]:
            print(f"Processing {catalog_file}...")
            processor.process(catalog_file=catalog_file, json_column_name=json_col, default_schema=schema)
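
For reference, a hedged sketch of the self.config mapping this method appears to expect, inferred from the keys it reads above; the values are placeholders, not defaults of the tool.

config = {
    "integration_type": "postgres",           # parsed via DestinationType.from_string
    "schema": "public",                       # default_schema passed to process()
    "output_path": "models/generated",        # output_directory for CatalogProcessor
    "json_column": "_airbyte_data",           # json_column_name passed to process()
    "catalog": ["destination_catalog.json"],  # one or more catalog files to process
}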
Example #3
def generate_dbt_models(destination_type: DestinationType, test_root_dir: str, column_count: int):
    """
    This is the normalization step, generating dbt model files from the destination_catalog.json taken as input.
    """
    output_directory = os.path.join(test_root_dir, "models", "generated")
    shutil.rmtree(output_directory, ignore_errors=True)
    catalog_processor = CatalogProcessor(output_directory, destination_type)
    catalog_config = {
        "streams": [
            {
                "stream": {
                    "name": dbt_test_utils.generate_random_string(f"stream_with_{column_count}_columns"),
                    "json_schema": {
                        "type": ["null", "object"],
                        "properties": {},
                    },
                    "supported_sync_modes": ["incremental"],
                    "source_defined_cursor": True,
                    "default_cursor_field": [],
                },
                "sync_mode": "incremental",
                "cursor_field": [],
                "destination_sync_mode": "overwrite",
            }
        ]
    }
    stream_properties = catalog_config["streams"][0]["stream"]["json_schema"]["properties"]
    if column_count == 1:
        stream_properties["_airbyte_id"] = {"type": "integer"}
    else:
        for column in [dbt_test_utils.random_string(5) for _ in range(column_count)]:
            stream_properties[column] = {"type": "string"}
    catalog = os.path.join(test_root_dir, "catalog.json")
    with open(catalog, "w") as fh:
        fh.write(json.dumps(catalog_config))
    catalog_processor.process(catalog, "_airbyte_data", dbt_test_utils.target_schema)
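
One possible way to drive the helper above, e.g. from a parametrized test; the column counts and destination are illustrative only, and test_root_dir is assumed to be a prepared temporary directory.

# Hypothetical parametrization over catalog width (values are examples only):
for column_count in (1, 10, 1000):
    generate_dbt_models(DestinationType.POSTGRES, test_root_dir, column_count)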
Example #4
def test_resolve_names(destination_type: DestinationType, catalog_file: str):
    """
    For a given catalog.json and destination, multiple cases can occur where naming becomes tricky.
    (especially since some destination like postgres have a very low limit to identifiers length of 64 characters)

    In case of nested objects/arrays in a stream, names can drag on to very long names.
    Tests are built here using resources files as follow:
    - `<name of source or test types>_catalog.json`:
        input catalog.json, typically as what source would provide.
        For example Hubspot, Stripe and Facebook catalog.json contains some level of nesting.
        (here, nested_catalog.json is an extracted smaller sample of stream/properties from the facebook catalog)
    - `<name of source or test types>_expected_names.json`:
        list of expected table names

    For the expected json files, it is possible to specialize the file to a certain destination.
    So if for example, the resources folder contains these two expected files:
        - edge_cases_catalog_expected_names.json
        - edge_cases_catalog_expected_postgres_names.json
    Then the test will be using the first edge_cases_catalog_expected_names.json except for
    Postgres destination where the expected table names will come from edge_cases_catalog_expected_postgres_names.json

    The content of the expected_*.json files are the serialization of the stream_processor.tables_registry.registry
    """
    integration_type = destination_type.value
    tables_registry = TableNameRegistry(destination_type)

    catalog = read_json(f"resources/{catalog_file}.json")

    # process top level
    stream_processors = CatalogProcessor.build_stream_processor(
        catalog=catalog,
        json_column_name="'json_column_name_test'",
        default_schema="schema_test",
        name_transformer=DestinationNameTransformer(destination_type),
        destination_type=destination_type,
        tables_registry=tables_registry,
    )
    for stream_processor in stream_processors:
        # Check properties
        if not stream_processor.properties:
            raise EOFError("Invalid Catalog: Unexpected empty properties in catalog")
        stream_processor.collect_table_names()
    for conflict in tables_registry.resolve_names():
        print(
            f"WARN: Resolving conflict: {conflict.schema}.{conflict.table_name_conflict} "
            f"from '{'.'.join(conflict.json_path)}' into {conflict.table_name_resolved}"
        )
    apply_function = identity
    if DestinationType.SNOWFLAKE.value == destination_type.value:
        apply_function = str.upper
    elif DestinationType.REDSHIFT.value == destination_type.value:
        apply_function = str.lower
    if os.path.exists(f"resources/{catalog_file}_expected_{integration_type.lower()}_names.json"):
        expected_names = read_json(f"resources/{catalog_file}_expected_{integration_type.lower()}_names.json", apply_function)
    else:
        expected_names = read_json(f"resources/{catalog_file}_expected_names.json", apply_function)

    assert tables_registry.to_dict(apply_function) == expected_names
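
The destination-specific fallback used above could be factored into a small helper; this is only a sketch (the helper name is ours, os and the resource layout come from the test):

def expected_names_file(catalog_file: str, integration_type: str) -> str:
    # Prefer a destination-specialized expected file, otherwise fall back to the generic one.
    specialized = f"resources/{catalog_file}_expected_{integration_type.lower()}_names.json"
    return specialized if os.path.exists(specialized) else f"resources/{catalog_file}_expected_names.json"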
Example #5
def test_stream_processor_tables_naming(integration_type: str, catalog_file: str, setup_test_path):
    destination_type = DestinationType.from_string(integration_type)
    tables_registry = set()

    substreams = []
    catalog = read_json(f"resources/{catalog_file}.json")

    # process top level
    for stream_processor in CatalogProcessor.build_stream_processor(
            catalog=catalog,
            json_column_name="'json_column_name_test'",
            target_schema="schema_test",
            name_transformer=DestinationNameTransformer(destination_type),
            destination_type=destination_type,
            tables_registry=tables_registry,
    ):
        nested_processors = stream_processor.process()
        for table in stream_processor.local_registry:
            found_sql_output = False
            for sql_output in stream_processor.sql_outputs:
                if re.match(r".*/" + table + ".sql", sql_output) is not None:
                    found_sql_output = True
                    break
            assert found_sql_output
        add_table_to_registry(tables_registry, stream_processor)
        if nested_processors and len(nested_processors) > 0:
            substreams += nested_processors

    if os.path.exists(f"resources/{catalog_file}_expected_top_level_{integration_type.lower()}.json"):
        expected_top_level = set(
            read_json(f"resources/{catalog_file}_expected_top_level_{integration_type.lower()}.json")["tables"]
        )
    else:
        expected_top_level = set(read_json(f"resources/{catalog_file}_expected_top_level.json")["tables"])
        if DestinationType.SNOWFLAKE.value == destination_type.value:
            expected_top_level = {table.upper() for table in expected_top_level}
        elif DestinationType.REDSHIFT.value == destination_type.value:
            expected_top_level = {table.lower() for table in expected_top_level}
    assert tables_registry == expected_top_level

    # process substreams
    while substreams:
        children = substreams
        substreams = []
        for substream in children:
            substream.tables_registry = tables_registry
            nested_processors = substream.process()
            add_table_to_registry(tables_registry, substream)
            if nested_processors:
                substreams += nested_processors

    if os.path.exists(f"resources/{catalog_file}_expected_nested_{integration_type.lower()}.json"):
        expected_nested = set(
            read_json(f"resources/{catalog_file}_expected_nested_{integration_type.lower()}.json")["tables"]
        )
    else:
        expected_nested = set(read_json(f"resources/{catalog_file}_expected_nested.json")["tables"])
        if DestinationType.SNOWFLAKE.value == destination_type.value:
            expected_nested = {table.upper() for table in expected_nested}
        elif DestinationType.REDSHIFT.value == destination_type.value:
            expected_nested = {table.lower() for table in expected_nested}
    assert (tables_registry - expected_top_level) == expected_nested
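
The Snowflake/Redshift case folding applied to the expected sets above could also be expressed as a small helper; a sketch only, mirroring the branches in the test:

def normalize_expected_tables(tables: set, destination_type: DestinationType) -> set:
    # Snowflake folds identifiers to upper case, Redshift to lower case; other destinations are untouched.
    if DestinationType.SNOWFLAKE.value == destination_type.value:
        return {table.upper() for table in tables}
    if DestinationType.REDSHIFT.value == destination_type.value:
        return {table.lower() for table in tables}
    return tables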
Example #6
def test_stream_processor_tables_naming(integration_type: str, catalog_file: str, setup_test_path):
    destination_type = DestinationType.from_string(integration_type)
    tables_registry = {}

    substreams = []
    catalog = read_json(f"resources/{catalog_file}.json")

    # process top level
    for stream_processor in CatalogProcessor.build_stream_processor(
            catalog=catalog,
            json_column_name="'json_column_name_test'",
            default_schema="schema_test",
            name_transformer=DestinationNameTransformer(destination_type),
            destination_type=destination_type,
            tables_registry=tables_registry,
    ):
        nested_processors = stream_processor.process()
        for schema in stream_processor.local_registry:
            for table in stream_processor.local_registry[schema]:
                found_sql_output = False
                for sql_output in stream_processor.sql_outputs:
                    file_name = f"{schema}_{table}"
                    if len(file_name) > stream_processor.name_transformer.get_name_max_length():
                        file_name = stream_processor.name_transformer.truncate_identifier_name(input_name=file_name)
                    if re.match(r".*/" + file_name + ".sql", sql_output) is not None:
                        found_sql_output = True
                        break
                assert found_sql_output
        add_table_to_registry(tables_registry, stream_processor)
        if nested_processors and len(nested_processors) > 0:
            substreams += nested_processors

    if os.path.exists(f"resources/{catalog_file}_expected_top_level_{integration_type.lower()}.json"):
        expected_top_level = set(
            read_json(f"resources/{catalog_file}_expected_top_level_{integration_type.lower()}.json")["tables"]
        )
    else:
        expected_top_level = set(read_json(f"resources/{catalog_file}_expected_top_level.json")["tables"])
        if DestinationType.SNOWFLAKE.value == destination_type.value:
            expected_top_level = {table.upper() for table in expected_top_level}
        elif DestinationType.REDSHIFT.value == destination_type.value:
            expected_top_level = {table.lower() for table in expected_top_level}

    # process substreams
    while substreams:
        children = substreams
        substreams = []
        for substream in children:
            substream.tables_registry = tables_registry
            nested_processors = substream.process()
            add_table_to_registry(tables_registry, substream)
            if nested_processors:
                substreams += nested_processors

    if os.path.exists(f"resources/{catalog_file}_expected_nested_{integration_type.lower()}.json"):
        expected_nested = set(
            read_json(f"resources/{catalog_file}_expected_nested_{integration_type.lower()}.json")["tables"]
        )
    else:
        expected_nested = set(read_json(f"resources/{catalog_file}_expected_nested.json")["tables"])
        if DestinationType.SNOWFLAKE.value == destination_type.value:
            expected_nested = {table.upper() for table in expected_nested}
        elif DestinationType.REDSHIFT.value == destination_type.value:
            expected_nested = {table.lower() for table in expected_nested}

    # TODO(davin): Instead of unwrapping all tables, rewrite this test so tables are compared based on schema.
    all_tables = set()
    for schema in tables_registry:
        for tables in tables_registry[schema]:
            all_tables.add(tables)

    assert (all_tables - expected_top_level) == expected_nested
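
A sketch of the schema-aware comparison hinted at by the TODO above, keeping tables grouped per schema instead of flattening them into one set (the helper name is ours):

def registry_as_schema_map(registry: dict) -> dict:
    # Map each schema to the set of its table names; registry values may be
    # dicts of table -> file name, so set() keeps only the table-name keys.
    return {schema: set(tables) for schema, tables in registry.items()}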
Example #7
def test_stream_processor_tables_naming(integration_type: str, catalog_file: str):
    """
    For a given catalog.json and destination, multiple cases can occur where naming becomes tricky.
    (especially since some destination like postgres have a very low limit to identifiers length of 64 characters)

    In case of nested objects/arrays in a stream, names can drag on to very long names.
    Tests are built here using resources files as follow:
    - `<name of source or test types>_catalog.json`:
        input catalog.json, typically as what source would provide.
        For example Hubspot, Stripe and Facebook catalog.json contains some level of nesting.
        (here, nested_catalog.json is an extracted smaller sample of stream/properties from the facebook catalog)
    - `<name of source or test types>_expected_top_level.json`:
        list of expected table names for the top level stream names
    - `<name of source or test types>_expected_nested.json`:
        list of expected table names for nested objects, extracted to their own and separate table names

    For the expected json files, it is possible to specialize the file to a certain destination.
    So if for example, the resources folder contains these two expected files:
        - edge_cases_catalog_expected_top_level.json
        - edge_cases_catalog_expected_top_level_postgres.json
    Then the test will be using the first edge_cases_catalog_expected_top_level.json except for
    Postgres destination where the expected table names will come from edge_cases_catalog_expected_top_level_postgres.json

    The content of the expected_*.json files are the serialization of the stream_processor.tables_registry
    (mapping per schema to all tables in that schema, mapping to the final filename)
    """
    destination_type = DestinationType.from_string(integration_type)
    tables_registry = {}

    substreams = []
    catalog = read_json(f"resources/{catalog_file}.json")

    # process top level
    for stream_processor in CatalogProcessor.build_stream_processor(
            catalog=catalog,
            json_column_name="'json_column_name_test'",
            default_schema="schema_test",
            name_transformer=DestinationNameTransformer(destination_type),
            destination_type=destination_type,
            tables_registry=tables_registry,
    ):
        nested_processors = stream_processor.process()
        tables_registry = add_table_to_registry(tables_registry, stream_processor)
        if nested_processors and len(nested_processors) > 0:
            substreams += nested_processors

    apply_function = None
    if DestinationType.SNOWFLAKE.value == destination_type.value:
        apply_function = str.upper
    elif DestinationType.REDSHIFT.value == destination_type.value:
        apply_function = str.lower
    if os.path.exists(f"resources/{catalog_file}_expected_top_level_{integration_type.lower()}.json"):
        expected_top_level = read_json(
            f"resources/{catalog_file}_expected_top_level_{integration_type.lower()}.json", apply_function
        )
    else:
        expected_top_level = read_json(f"resources/{catalog_file}_expected_top_level.json", apply_function)

    assert tables_registry == expected_top_level

    # process substreams
    while substreams:
        children = substreams
        substreams = []
        for substream in children:
            substream.tables_registry = tables_registry
            nested_processors = substream.process()
            tables_registry = add_table_to_registry(tables_registry, substream)
            if nested_processors:
                substreams += nested_processors

    apply_function = None
    if DestinationType.SNOWFLAKE.value == destination_type.value:
        apply_function = str.upper
    elif DestinationType.REDSHIFT.value == destination_type.value:
        apply_function = str.lower
    if os.path.exists(f"resources/{catalog_file}_expected_nested_{integration_type.lower()}.json"):
        expected_nested = read_json(
            f"resources/{catalog_file}_expected_nested_{integration_type.lower()}.json", apply_function
        )
    else:
        expected_nested = read_json(f"resources/{catalog_file}_expected_nested.json", apply_function)

    # remove expected top level tables from tables_registry
    for schema in expected_top_level:
        for table in expected_top_level[schema]:
            del tables_registry[schema][table]
        if len(tables_registry[schema]) == 0:
            del tables_registry[schema]
    assert tables_registry == expected_nested
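
For orientation, a hedged illustration of the expected_*.json shape described in the docstring (schema, then table name, then final file name); the entries themselves are made up:

expected_top_level_example = {
    "schema_test": {
        "exchange_rate": "exchange_rate",
        "dedup_exchange_rate": "dedup_exchange_rate",
    }
}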