Example #1
def get_table_name(name_transformer: DestinationNameTransformer, parent: str,
                   child: str, suffix: str, json_path: List[str]) -> str:
    max_length = name_transformer.get_name_max_length() - 2  # less two for the underscores
    json_path_hash = hash_json_path(json_path)
    norm_suffix = suffix if not suffix or suffix.startswith("_") else f"_{suffix}"
    norm_parent = parent if not parent else name_transformer.normalize_table_name(parent, False, False)
    norm_child = name_transformer.normalize_table_name(child, False, False)
    min_parent_length = min(MINIMUM_PARENT_LENGTH, len(norm_parent))

    # no parent
    if not parent:
        return name_transformer.truncate_identifier_name(f"{norm_child}{norm_suffix}")
    # if everything fits without truncation, don't truncate anything
    elif (len(norm_parent) + len(norm_child) + len(json_path_hash) + len(norm_suffix)) < max_length:
        return f"{norm_parent}_{json_path_hash}_{norm_child}{norm_suffix}"
    # if everything fits except for the parent, just truncate the parent
    elif (len(norm_child) + len(json_path_hash) + len(norm_suffix)) < (max_length - min_parent_length):
        max_parent_length = max_length - len(norm_child) - len(json_path_hash) - len(norm_suffix)
        return f"{norm_parent[:max_parent_length]}_{json_path_hash}_{norm_child}{norm_suffix}"
    # otherwise first truncate parent to the minimum length and middle truncate the child
    else:
        norm_child_max_length = max_length - min_parent_length - len(json_path_hash) - len(norm_suffix)
        trunc_norm_child = name_transformer.truncate_identifier_name(norm_child, norm_child_max_length)
        return f"{norm_parent[:min_parent_length]}_{json_path_hash}_{trunc_norm_child}{norm_suffix}"
Example #2
def test_normalize_column_name(input_str: str, destination_type: str,
                               expected: str, expected_in_jinja: str):
    t = DestinationType.from_string(destination_type)
    assert DestinationNameTransformer(t).normalize_column_name(
        input_str, in_jinja=False) == expected
    assert DestinationNameTransformer(t).normalize_column_name(
        input_str, in_jinja=True) == expected_in_jinja
Example #3
def get_nested_hashed_table_name(name_transformer: DestinationNameTransformer, schema: str, json_path: List[str], child: str) -> str:
    """
    In the normalization code base, we often have to deal with naming tables, combining information from:
    - parent table: to denote where a table is extracted from (in case of nesting)
    - child table: in case of nesting, the field name or the original stream name
    - extra suffix: normalization is done in multiple transformation steps, each of which may need to generate
    separate tables, so we can add a suffix to distinguish the different transformation steps of a pipeline.
    - json path: the chain of parent and nested field names leading to the table currently being built

    All of this information should be included (if possible) in the table name so the user can (somehow) identify and
    recognize what data is available there.
    """
    parent = "_".join(json_path[:-1])
    max_length = name_transformer.get_name_max_length()
    json_path_hash = hash_json_path([schema] + json_path)
    norm_parent = parent if not parent else name_transformer.normalize_table_name(parent, False, False)
    norm_child = name_transformer.normalize_table_name(child, False, False)
    min_parent_length = min(MINIMUM_PARENT_LENGTH, len(norm_parent))

    # no parent
    if not parent:
        raise RuntimeError("There are no nested table names without parents")
    # if everything fits without truncation, don't truncate anything
    elif (len(norm_parent) + len(json_path_hash) + len(norm_child) + 2) < max_length:
        return f"{norm_parent}_{json_path_hash}_{norm_child}"
    # if everything fits except for the parent, just truncate the parent (still guarantees parent is of length min_parent_length)
    elif (min_parent_length + len(json_path_hash) + len(norm_child) + 2) < max_length:
        max_parent_length = max_length - len(json_path_hash) - len(norm_child) - 2
        return f"{norm_parent[:max_parent_length]}_{json_path_hash}_{norm_child}"
    # otherwise first truncate parent to the minimum length and middle truncate the child too
    else:
        norm_child_max_length = max_length - len(json_path_hash) - 2 - min_parent_length
        trunc_norm_child = name_transformer.truncate_identifier_name(norm_child, norm_child_max_length)
        return f"{norm_parent[:min_parent_length]}_{json_path_hash}_{trunc_norm_child}"
Example #4
    def build_stream_processor(
        catalog: Dict,
        json_column_name: str,
        default_schema: str,
        name_transformer: DestinationNameTransformer,
        destination_type: DestinationType,
        tables_registry: TableNameRegistry,
    ) -> List[StreamProcessor]:
        result = []
        for configured_stream in get_field(catalog, "streams", "Invalid Catalog: 'streams' is not defined in Catalog"):
            stream_config = get_field(configured_stream, "stream", "Invalid Stream: 'stream' is not defined in Catalog streams")

            # The logic here matches the logic in JdbcBufferedConsumerFactory.java.
            # Any modifications need to be reflected there and vice versa.
            schema = default_schema
            if "namespace" in stream_config:
                schema = stream_config["namespace"]

            schema_name = name_transformer.normalize_schema_name(schema, truncate=False)
            raw_schema_name = name_transformer.normalize_schema_name(f"_airbyte_{schema}", truncate=False)
            stream_name = get_field(stream_config, "name", f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}")
            raw_table_name = name_transformer.normalize_table_name(f"_airbyte_raw_{stream_name}", truncate=False)

            source_sync_mode = get_source_sync_mode(configured_stream, stream_name)
            destination_sync_mode = get_destination_sync_mode(configured_stream, stream_name)
            cursor_field = []
            primary_key = []
            if source_sync_mode.value == SyncMode.incremental.value or destination_sync_mode.value in [
                # DestinationSyncMode.upsert_dedup.value,
                DestinationSyncMode.append_dedup.value,
            ]:
                cursor_field = get_field(configured_stream, "cursor_field", f"Undefined cursor field for stream {stream_name}")
            if destination_sync_mode.value in [
                # DestinationSyncMode.upsert_dedup.value,
                DestinationSyncMode.append_dedup.value
            ]:
                primary_key = get_field(configured_stream, "primary_key", f"Undefined primary key for stream {stream_name}")

            message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
            properties = get_field(get_field(stream_config, "json_schema", message), "properties", message)

            from_table = "source('{}', '{}')".format(schema_name, raw_table_name)

            stream_processor = StreamProcessor.create(
                stream_name=stream_name,
                destination_type=destination_type,
                raw_schema=raw_schema_name,
                schema=schema_name,
                source_sync_mode=source_sync_mode,
                destination_sync_mode=destination_sync_mode,
                cursor_field=cursor_field,
                primary_key=primary_key,
                json_column_name=f"'{json_column_name}'",
                properties=properties,
                tables_registry=tables_registry,
                from_table=from_table,
            )
            result.append(stream_processor)
        return result
Example #5
def test_normalize_name(input_str: str, destination_type: str, expected: str,
                        expected_column: str):
    t = DestinationType.from_string(destination_type)
    assert DestinationNameTransformer(t).normalize_schema_name(
        input_str) == expected
    assert DestinationNameTransformer(t).normalize_table_name(
        input_str) == expected
    assert DestinationNameTransformer(t).normalize_column_name(
        input_str) == expected_column
    def build_stream_processor(
        catalog: Dict,
        json_column_name: str,
        target_schema: str,
        name_transformer: DestinationNameTransformer,
        destination_type: DestinationType,
        tables_registry: Set[str],
    ) -> List[StreamProcessor]:
        result = []
        for configured_stream in get_field(
                catalog, "streams",
                "Invalid Catalog: 'streams' is not defined in Catalog"):
            stream_config = get_field(
                configured_stream, "stream",
                "Invalid Stream: 'stream' is not defined in Catalog streams")
            schema_name = name_transformer.normalize_schema_name(target_schema)
            raw_schema_name = name_transformer.normalize_schema_name(
                f"_airbyte_{target_schema}", truncate=False)
            stream_name = get_field(
                stream_config, "name",
                f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}"
            )
            raw_table_name = name_transformer.normalize_table_name(
                f"_airbyte_raw_{stream_name}", truncate=False)

            message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
            properties = get_field(
                get_field(stream_config, "json_schema", message), "properties",
                message)

            from_table = "source('{}', '{}')".format(schema_name,
                                                     raw_table_name)

            # Check properties
            if not properties:
                raise EOFError(
                    "Invalid Catalog: Unexpected empty properties in catalog")

            stream_processor = StreamProcessor.create(
                stream_name=stream_name,
                destination_type=destination_type,
                raw_schema=raw_schema_name,
                schema=schema_name,
                json_column_name=f"'{json_column_name}'",
                properties=properties,
                tables_registry=tables_registry,
                from_table=from_table,
            )
            result.append(stream_processor)
        return result
Example #7
def test_truncate_identifier(input_str: str, expected: str):
    """
    Rules about truncation, for example for both of these strings, which are too long for the Postgres limit of 64 characters:
    - `Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii`
    - `Aaaa_Bbbb_Cccc_Dddd_Eeee_a_very_long_name_Ffff_Gggg_Hhhh_Iiii`

    How to truncate (in the middle) is verified in these tests.
    In this instance, both strings end up as `Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii`
    and can therefore cause a collision in table names.

    Note that dealing with such collisions is not part of `destination_name_transformer` but of the `stream_processor`.
    """
    name_transformer = DestinationNameTransformer(DestinationType.POSTGRES)
    print(f"Truncating from #{len(input_str)} to #{len(expected)}")
    assert name_transformer.truncate_identifier_name(input_str) == expected
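
The docstring's example can be restated as executable assertions; this sketch only re-expresses what the docstring above already claims, using the same Postgres transformer.

transformer = DestinationNameTransformer(DestinationType.POSTGRES)
collided = "Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii"

# Both over-limit identifiers middle-truncate to the same 43-character value,
# which is why collision handling is left to the stream_processor.
assert transformer.truncate_identifier_name("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii") == collided
assert transformer.truncate_identifier_name("Aaaa_Bbbb_Cccc_Dddd_Eeee_a_very_long_name_Ffff_Gggg_Hhhh_Iiii") == collided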
Example #8
    def __init__(
        self,
        stream_name: str,
        destination_type: DestinationType,
        raw_schema: str,
        schema: str,
        json_column_name: str,
        properties: Dict,
        tables_registry: Set[str],
        from_table: str,
    ):
        """
        See StreamProcessor.create()
        """
        self.stream_name: str = stream_name
        self.destination_type: DestinationType = destination_type
        self.raw_schema: str = raw_schema
        self.schema: str = schema
        self.json_column_name: str = json_column_name
        self.properties: Dict = properties
        self.tables_registry: Set[str] = tables_registry
        self.from_table: str = from_table

        self.name_transformer: DestinationNameTransformer = DestinationNameTransformer(
            destination_type)
        self.json_path: List[str] = [stream_name]
        self.final_table_name: str = ""
        self.sql_outputs: Dict[str, str] = {}
        self.local_registry: Set[str] = set()
        self.parent: Optional["StreamProcessor"] = None
        self.is_nested_array: bool = False
def test_get_nested_hashed_table_name(json_path: List[str], expected_postgres: str, expected_bigquery: str):
    """
    Checks the strategies for combining all fields into a single, unique table name that lets the user (somehow)
    identify and recognize what data is available in there.
    A set of complicated rules decides which parts to truncate, which to keep, and how to handle
    name collisions.
    """
    child = json_path[-1]
    postgres_name_transformer = DestinationNameTransformer(DestinationType.POSTGRES)
    actual_postgres_name = get_nested_hashed_table_name(postgres_name_transformer, "schema", json_path, child)
    assert actual_postgres_name == expected_postgres
    assert len(actual_postgres_name) <= 43  # explicitly check for our max postgres length in case tests are changed in the future

    bigquery_name_transformer = DestinationNameTransformer(DestinationType.BIGQUERY)
    actual_bigquery_name = get_nested_hashed_table_name(bigquery_name_transformer, "schema", json_path, child)
    assert actual_bigquery_name == expected_bigquery
Example #10
 def __init__(self, output_directory: str, destination_type: DestinationType):
     """
     @param output_directory is the path to the directory where this processor should write the resulting SQL files (DBT models)
     @param destination_type is the destination type of warehouse
     """
     self.output_directory: str = output_directory
     self.destination_type: DestinationType = destination_type
     self.name_transformer: DestinationNameTransformer = DestinationNameTransformer(destination_type)
def test_resolve_names(destination_type: DestinationType, catalog_file: str):
    """
    For a given catalog.json and destination, multiple cases can occur where naming becomes tricky
    (especially since some destinations, like Postgres, have a very low identifier length limit of 64 characters).

    Nested objects/arrays in a stream can drag names out into very long identifiers.
    Tests are built here using resource files as follows:
    - `<name of source or test types>_catalog.json`:
        the input catalog.json, typically what a source would provide.
        For example, the Hubspot, Stripe and Facebook catalog.json files contain some level of nesting.
        (here, nested_catalog.json is a smaller sample of streams/properties extracted from the Facebook catalog)
    - `<name of source or test types>_expected_names.json`:
        the list of expected table names

    For the expected json files, it is possible to specialize the file to a certain destination.
    So if, for example, the resources folder contains these two expected files:
        - edge_cases_catalog_expected_names.json
        - edge_cases_catalog_expected_postgres_names.json
    then the test will use edge_cases_catalog_expected_names.json, except for the Postgres
    destination, where the expected table names will come from edge_cases_catalog_expected_postgres_names.json.

    The content of the expected_*.json files is the serialization of stream_processor.tables_registry.registry.
    """
    integration_type = destination_type.value
    tables_registry = TableNameRegistry(destination_type)

    catalog = read_json(f"resources/{catalog_file}.json")

    # process top level
    stream_processors = CatalogProcessor.build_stream_processor(
        catalog=catalog,
        json_column_name="'json_column_name_test'",
        default_schema="schema_test",
        name_transformer=DestinationNameTransformer(destination_type),
        destination_type=destination_type,
        tables_registry=tables_registry,
    )
    for stream_processor in stream_processors:
        # Check properties
        if not stream_processor.properties:
            raise EOFError("Invalid Catalog: Unexpected empty properties in catalog")
        stream_processor.collect_table_names()
    for conflict in tables_registry.resolve_names():
        print(
            f"WARN: Resolving conflict: {conflict.schema}.{conflict.table_name_conflict} "
            f"from '{'.'.join(conflict.json_path)}' into {conflict.table_name_resolved}"
        )
    apply_function = identity
    if DestinationType.SNOWFLAKE.value == destination_type.value:
        apply_function = str.upper
    elif DestinationType.REDSHIFT.value == destination_type.value:
        apply_function = str.lower
    if os.path.exists(f"resources/{catalog_file}_expected_{integration_type.lower()}_names.json"):
        expected_names = read_json(f"resources/{catalog_file}_expected_{integration_type.lower()}_names.json", apply_function)
    else:
        expected_names = read_json(f"resources/{catalog_file}_expected_names.json", apply_function)

    assert tables_registry.to_dict(apply_function) == expected_names
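
The per-destination fallback described in the docstring can be read as a small lookup helper; the function below is a hypothetical sketch mirroring the logic above, not code from the repository.

import os

def expected_names_path(catalog_file: str, integration_type: str) -> str:
    # Prefer a destination-specific expected file; fall back to the generic one,
    # mirroring the os.path.exists() check in test_resolve_names above.
    specialized = f"resources/{catalog_file}_expected_{integration_type.lower()}_names.json"
    if os.path.exists(specialized):
        return specialized
    return f"resources/{catalog_file}_expected_names.json"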
def test_get_table_name(root_table: str, base_table_name: str, suffix: str,
                        expected: str):
    name_transformer = DestinationNameTransformer(DestinationType.POSTGRES)
    name = get_table_name(name_transformer, root_table, base_table_name,
                          suffix, ["json", "path"])
    assert name == expected
    assert len(name) <= 43  # explicitly check for our max postgres length in case tests are changed in the future
Example #13
def get_table_name(name_transformer: DestinationNameTransformer, parent: str,
                   child: str, suffix: str, json_path: List[str]) -> str:
    """
    In the normalization code base, we often have to deal with naming tables, combining information from:
    - parent table: to denote where a table is extracted from (in case of nesting)
    - child table: in case of nesting, the field name or the original stream name
    - extra suffix: normalization is done in multiple transformation steps, each of which may need to generate
    separate tables, so we can add a suffix to distinguish the different transformation steps of a pipeline.
    - json path: the chain of parent and nested field names leading to the table currently being built

    All of this information should be included (if possible) in the table name so the user can (somehow) identify and
    recognize what data is available there.
    """
    max_length = name_transformer.get_name_max_length() - 2  # less two for the underscores
    json_path_hash = hash_json_path(json_path)
    norm_suffix = suffix if not suffix or suffix.startswith("_") else f"_{suffix}"
    norm_parent = parent if not parent else name_transformer.normalize_table_name(parent, False, False)
    norm_child = name_transformer.normalize_table_name(child, False, False)
    min_parent_length = min(MINIMUM_PARENT_LENGTH, len(norm_parent))

    # no parent
    if not parent:
        return name_transformer.truncate_identifier_name(f"{norm_child}{norm_suffix}")
    # if everything fits without truncation, don't truncate anything
    elif (len(norm_parent) + len(norm_child) + len(json_path_hash) + len(norm_suffix)) < max_length:
        return f"{norm_parent}_{json_path_hash}_{norm_child}{norm_suffix}"
    # if everything fits except for the parent, just truncate the parent
    elif (len(norm_child) + len(json_path_hash) + len(norm_suffix)) < (max_length - min_parent_length):
        max_parent_length = max_length - len(norm_child) - len(json_path_hash) - len(norm_suffix)
        return f"{norm_parent[:max_parent_length]}_{json_path_hash}_{norm_child}{norm_suffix}"
    # otherwise first truncate parent to the minimum length and middle truncate the child
    else:
        norm_child_max_length = max_length - min_parent_length - len(json_path_hash) - len(norm_suffix)
        trunc_norm_child = name_transformer.truncate_identifier_name(norm_child, norm_child_max_length)
        return f"{norm_parent[:min_parent_length]}_{json_path_hash}_{trunc_norm_child}{norm_suffix}"
Example #14
 def __init__(self, destination_type: DestinationType):
     """
     @param destination_type is the destination type of warehouse
     """
     self.destination_type: DestinationType = destination_type
     self.name_transformer: DestinationNameTransformer = DestinationNameTransformer(destination_type)
     # Simple registries collect "simple" file and table names (with potential collisions)
     self.simple_file_registry: NormalizedFilesRegistry = NormalizedFilesRegistry()
     self.simple_table_registry: NormalizedTablesRegistry = NormalizedTablesRegistry(self.name_transformer)
     # Registry is the collision-free (resolved) mapping from the stream's schema json_path to the names that should be used
     self.registry: Dict[str, ResolvedNameMetadata] = {}
Example #15
def test_get_table_name(root_table: str, base_table_name: str, suffix: str,
                        expected: str):
    """
    - parent table: referred to as the root table
    - child table: referred to as the base table
    - extra suffix: the normalization step, used as a suffix to decompose the pipeline into multiple steps
    - json path: the chain of parent and nested field names leading to the table currently being built

    See the documentation of the get_table_name method for more details.

    This test checks the strategies for combining all those fields into a single table name that lets the user (somehow) identify and
    recognize what data is available in there.
    A set of complicated rules decides which parts to truncate, which to keep, and how to handle
    name collisions.
    """
    name_transformer = DestinationNameTransformer(DestinationType.POSTGRES)
    name = get_table_name(name_transformer, root_table, base_table_name,
                          suffix, ["json", "path"])
    assert name == expected
    assert len(name) <= 43  # explicitly check for our max postgres length in case tests are changed in the future
Example #16
    def __init__(
        self,
        stream_name: str,
        destination_type: DestinationType,
        raw_schema: str,
        schema: str,
        source_sync_mode: SyncMode,
        destination_sync_mode: DestinationSyncMode,
        cursor_field: List[str],
        primary_key: List[List[str]],
        json_column_name: str,
        properties: Dict,
        tables_registry: Dict[str, Dict[str, str]],
        from_table: str,
    ):
        """
        See StreamProcessor.create()
        """
        self.stream_name: str = stream_name
        self.destination_type: DestinationType = destination_type
        self.raw_schema: str = raw_schema
        self.schema: str = schema
        self.source_sync_mode: SyncMode = source_sync_mode
        self.destination_sync_mode: DestinationSyncMode = destination_sync_mode
        self.cursor_field: List[str] = cursor_field
        self.primary_key: List[List[str]] = primary_key
        self.json_column_name: str = json_column_name
        self.properties: Dict = properties
        self.tables_registry: Dict[str, Dict[str, str]] = tables_registry
        self.from_table: str = from_table

        self.name_transformer: DestinationNameTransformer = DestinationNameTransformer(
            destination_type)
        self.json_path: List[str] = [stream_name]
        self.final_table_name: str = ""
        self.sql_outputs: Dict[str, str] = {}
        self.local_registry: Dict[str, Dict[str, str]] = {}
        self.parent: Optional["StreamProcessor"] = None
        self.is_nested_array: bool = False
Example #17
def test_needs_quote(input_str: str, destination_type: str, expected: bool):
    name_transformer = DestinationNameTransformer(
        DestinationType.from_string(destination_type))
    assert name_transformer.needs_quotes(input_str) == expected
Example #18
    def build_stream_processor(
        catalog: Dict,
        json_column_name: str,
        default_schema: str,
        name_transformer: DestinationNameTransformer,
        destination_type: DestinationType,
        tables_registry: TableNameRegistry,
    ) -> List[StreamProcessor]:
        result = []
        for configured_stream in get_field(catalog, "streams", "Invalid Catalog: 'streams' is not defined in Catalog"):
            stream_config = get_field(configured_stream, "stream", "Invalid Stream: 'stream' is not defined in Catalog streams")

            # The logic here matches the logic in JdbcBufferedConsumerFactory.java.
            # Any modifications need to be reflected there and vice versa.
            schema = default_schema
            if "namespace" in stream_config:
                schema = stream_config["namespace"]

            schema_name = name_transformer.normalize_schema_name(schema, truncate=False)
            if destination_type == DestinationType.ORACLE:
                quote_in_parenthesis = re.compile(r"quote\((.*)\)")
                raw_schema_name = name_transformer.normalize_schema_name(schema, truncate=False)
                if not quote_in_parenthesis.findall(json_column_name):
                    json_column_name = name_transformer.normalize_column_name(json_column_name, in_jinja=True)
            else:
                column_inside_single_quote = re.compile(r"\'(.*)\'")
                raw_schema_name = name_transformer.normalize_schema_name(f"_airbyte_{schema}", truncate=False)
                if not column_inside_single_quote.findall(json_column_name):
                    json_column_name = f"'{json_column_name}'"

            stream_name = get_field(stream_config, "name", f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}")
            # MySQL table names need to be manually truncated, because it does not do it automatically
            truncate = destination_type == DestinationType.MYSQL
            raw_table_name = name_transformer.normalize_table_name(f"_airbyte_raw_{stream_name}", truncate=truncate)

            source_sync_mode = get_source_sync_mode(configured_stream, stream_name)
            destination_sync_mode = get_destination_sync_mode(configured_stream, stream_name)
            cursor_field = []
            primary_key = []
            if source_sync_mode.value == SyncMode.incremental.value or destination_sync_mode.value in [
                # DestinationSyncMode.upsert_dedup.value,
                DestinationSyncMode.append_dedup.value,
            ]:
                cursor_field = get_field(configured_stream, "cursor_field", f"Undefined cursor field for stream {stream_name}")
            if destination_sync_mode.value in [
                # DestinationSyncMode.upsert_dedup.value,
                DestinationSyncMode.append_dedup.value
            ]:
                primary_key = get_field(configured_stream, "primary_key", f"Undefined primary key for stream {stream_name}")

            message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
            properties = get_field(get_field(stream_config, "json_schema", message), "properties", message)

            from_table = dbt_macro.Source(schema_name, raw_table_name)

            stream_processor = StreamProcessor.create(
                stream_name=stream_name,
                destination_type=destination_type,
                raw_schema=raw_schema_name,
                default_schema=default_schema,
                schema=schema_name,
                source_sync_mode=source_sync_mode,
                destination_sync_mode=destination_sync_mode,
                cursor_field=cursor_field,
                primary_key=primary_key,
                json_column_name=json_column_name,
                properties=properties,
                tables_registry=tables_registry,
                from_table=from_table,
            )
            result.append(stream_processor)
        return result
Example #19
    def build_stream_processor(
        catalog: Dict,
        json_column_name: str,
        target_schema: str,
        name_transformer: DestinationNameTransformer,
        destination_type: DestinationType,
        tables_registry: Set[str],
    ) -> List[StreamProcessor]:
        result = []
        for configured_stream in get_field(
                catalog, "streams",
                "Invalid Catalog: 'streams' is not defined in Catalog"):
            stream_config = get_field(
                configured_stream, "stream",
                "Invalid Stream: 'stream' is not defined in Catalog streams")
            schema_name = name_transformer.normalize_schema_name(target_schema)
            raw_schema_name = name_transformer.normalize_schema_name(
                f"_airbyte_{target_schema}", truncate=False)
            stream_name = get_field(
                stream_config, "name",
                f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}"
            )
            raw_table_name = name_transformer.normalize_table_name(
                f"_airbyte_raw_{stream_name}", truncate=False)

            source_sync_mode = get_source_sync_mode(configured_stream,
                                                    stream_name)
            destination_sync_mode = get_destination_sync_mode(
                configured_stream, stream_name)
            cursor_field = []
            primary_key = []
            if source_sync_mode.value == SyncMode.incremental.value or destination_sync_mode.value in [
                    # DestinationSyncMode.upsert_dedup.value,
                    DestinationSyncMode.append_dedup.value,
            ]:
                cursor_field = get_field(
                    configured_stream, "cursor_field",
                    f"Undefined cursor field for stream {stream_name}")
            if destination_sync_mode.value in [
                    # DestinationSyncMode.upsert_dedup.value,
                    DestinationSyncMode.append_dedup.value
            ]:
                primary_key = get_field(
                    configured_stream, "primary_key",
                    f"Undefined primary key for stream {stream_name}")

            message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
            properties = get_field(
                get_field(stream_config, "json_schema", message), "properties",
                message)

            from_table = "source('{}', '{}')".format(schema_name,
                                                     raw_table_name)

            # Check properties
            if not properties:
                raise EOFError(
                    "Invalid Catalog: Unexpected empty properties in catalog")

            stream_processor = StreamProcessor.create(
                stream_name=stream_name,
                destination_type=destination_type,
                raw_schema=raw_schema_name,
                schema=schema_name,
                source_sync_mode=source_sync_mode,
                destination_sync_mode=destination_sync_mode,
                cursor_field=cursor_field,
                primary_key=primary_key,
                json_column_name=f"'{json_column_name}'",
                properties=properties,
                tables_registry=tables_registry,
                from_table=from_table,
            )
            result.append(stream_processor)
        return result
Example #20
def test_stream_processor_tables_naming(integration_type: str,
                                        catalog_file: str):
    """
    For a given catalog.json and destination, multiple cases can occur where naming becomes tricky
    (especially since some destinations, like Postgres, have a very low identifier length limit of 64 characters).

    Nested objects/arrays in a stream can drag names out into very long identifiers.
    Tests are built here using resource files as follows:
    - `<name of source or test types>_catalog.json`:
        the input catalog.json, typically what a source would provide.
        For example, the Hubspot, Stripe and Facebook catalog.json files contain some level of nesting.
        (here, nested_catalog.json is a smaller sample of streams/properties extracted from the Facebook catalog)
    - `<name of source or test types>_expected_top_level.json`:
        the list of expected table names for the top level stream names
    - `<name of source or test types>_expected_nested.json`:
        the list of expected table names for nested objects, extracted into their own separate tables

    For the expected json files, it is possible to specialize the file to a certain destination.
    So if, for example, the resources folder contains these two expected files:
        - edge_cases_catalog_expected_top_level.json
        - edge_cases_catalog_expected_top_level_postgres.json
    then the test will use edge_cases_catalog_expected_top_level.json, except for the Postgres
    destination, where the expected table names will come from edge_cases_catalog_expected_top_level_postgres.json.

    The content of the expected_*.json files is the serialization of stream_processor.tables_registry
    (a mapping per schema of all tables in that schema to the final filename).
    """
    destination_type = DestinationType.from_string(integration_type)
    tables_registry = {}

    substreams = []
    catalog = read_json(f"resources/{catalog_file}.json")

    # process top level
    for stream_processor in CatalogProcessor.build_stream_processor(
            catalog=catalog,
            json_column_name="'json_column_name_test'",
            default_schema="schema_test",
            name_transformer=DestinationNameTransformer(destination_type),
            destination_type=destination_type,
            tables_registry=tables_registry,
    ):
        nested_processors = stream_processor.process()
        tables_registry = add_table_to_registry(tables_registry,
                                                stream_processor)
        if nested_processors and len(nested_processors) > 0:
            substreams += nested_processors

    apply_function = None
    if DestinationType.SNOWFLAKE.value == destination_type.value:
        apply_function = str.upper
    elif DestinationType.REDSHIFT.value == destination_type.value:
        apply_function = str.lower
    if os.path.exists(
            f"resources/{catalog_file}_expected_top_level_{integration_type.lower()}.json"
    ):
        expected_top_level = read_json(
            f"resources/{catalog_file}_expected_top_level_{integration_type.lower()}.json",
            apply_function)
    else:
        expected_top_level = read_json(
            f"resources/{catalog_file}_expected_top_level.json",
            apply_function)

    assert tables_registry == expected_top_level

    # process substreams
    while substreams:
        children = substreams
        substreams = []
        for substream in children:
            substream.tables_registry = tables_registry
            nested_processors = substream.process()
            tables_registry = add_table_to_registry(tables_registry, substream)
            if nested_processors:
                substreams += nested_processors

    apply_function = None
    if DestinationType.SNOWFLAKE.value == destination_type.value:
        apply_function = str.upper
    elif DestinationType.REDSHIFT.value == destination_type.value:
        apply_function = str.lower
    if os.path.exists(
            f"resources/{catalog_file}_expected_nested_{integration_type.lower()}.json"
    ):
        expected_nested = read_json(
            f"resources/{catalog_file}_expected_nested_{integration_type.lower()}.json",
            apply_function)
    else:
        expected_nested = read_json(
            f"resources/{catalog_file}_expected_nested.json", apply_function)

    # remove expected top level tables from tables_registry
    for schema in expected_top_level:
        for table in expected_top_level[schema]:
            del tables_registry[schema][table]
        if len(tables_registry[schema]) == 0:
            del tables_registry[schema]
    assert tables_registry == expected_nested
Example #21
def test_stream_processor_tables_naming(integration_type: str,
                                        catalog_file: str, setup_test_path):
    destination_type = DestinationType.from_string(integration_type)
    tables_registry = set()

    substreams = []
    catalog = read_json(f"resources/{catalog_file}.json")

    # process top level
    for stream_processor in CatalogProcessor.build_stream_processor(
            catalog=catalog,
            json_column_name="'json_column_name_test'",
            target_schema="schema_test",
            name_transformer=DestinationNameTransformer(destination_type),
            destination_type=destination_type,
            tables_registry=tables_registry,
    ):
        nested_processors = stream_processor.process()
        for table in stream_processor.local_registry:
            found_sql_output = False
            for sql_output in stream_processor.sql_outputs:
                if re.match(r".*/" + table + ".sql", sql_output) is not None:
                    found_sql_output = True
                    break
            assert found_sql_output
        add_table_to_registry(tables_registry, stream_processor)
        if nested_processors and len(nested_processors) > 0:
            substreams += nested_processors

    if os.path.exists(
            f"resources/{catalog_file}_expected_top_level_{integration_type.lower()}.json"
    ):
        expected_top_level = set(
            read_json(
                f"resources/{catalog_file}_expected_top_level_{integration_type.lower()}.json"
            )["tables"])
    else:
        expected_top_level = set(
            read_json(f"resources/{catalog_file}_expected_top_level.json")
            ["tables"])
        if DestinationType.SNOWFLAKE.value == destination_type.value:
            expected_top_level = {
                table.upper()
                for table in expected_top_level
            }
        elif DestinationType.REDSHIFT.value == destination_type.value:
            expected_top_level = {
                table.lower()
                for table in expected_top_level
            }
    assert tables_registry == expected_top_level

    # process substreams
    while substreams:
        children = substreams
        substreams = []
        for substream in children:
            substream.tables_registry = tables_registry
            nested_processors = substream.process()
            add_table_to_registry(tables_registry, substream)
            if nested_processors:
                substreams += nested_processors

    if os.path.exists(
            f"resources/{catalog_file}_expected_nested_{integration_type.lower()}.json"
    ):
        expected_nested = set(
            read_json(
                f"resources/{catalog_file}_expected_nested_{integration_type.lower()}.json"
            )["tables"])
    else:
        expected_nested = set(
            read_json(f"resources/{catalog_file}_expected_nested.json")
            ["tables"])
        if DestinationType.SNOWFLAKE.value == destination_type.value:
            expected_nested = {table.upper() for table in expected_nested}
        elif DestinationType.REDSHIFT.value == destination_type.value:
            expected_nested = {table.lower() for table in expected_nested}
    assert (tables_registry - expected_top_level) == expected_nested
def test_stream_processor_tables_naming(integration_type: str,
                                        catalog_file: str, setup_test_path):
    destination_type = DestinationType.from_string(integration_type)
    tables_registry = {}

    substreams = []
    catalog = read_json(f"resources/{catalog_file}.json")

    # process top level
    for stream_processor in CatalogProcessor.build_stream_processor(
            catalog=catalog,
            json_column_name="'json_column_name_test'",
            default_schema="schema_test",
            name_transformer=DestinationNameTransformer(destination_type),
            destination_type=destination_type,
            tables_registry=tables_registry,
    ):
        nested_processors = stream_processor.process()
        for schema in stream_processor.local_registry:
            for table in stream_processor.local_registry[schema]:
                found_sql_output = False
                for sql_output in stream_processor.sql_outputs:
                    file_name = f"{schema}_{table}"
                    if len(file_name) > stream_processor.name_transformer.get_name_max_length():
                        file_name = stream_processor.name_transformer.truncate_identifier_name(input_name=file_name)

                    if re.match(r".*/" + file_name + ".sql", sql_output) is not None:
                        found_sql_output = True
                        break
                assert found_sql_output
        add_table_to_registry(tables_registry, stream_processor)
        if nested_processors and len(nested_processors) > 0:
            substreams += nested_processors

    if os.path.exists(
            f"resources/{catalog_file}_expected_top_level_{integration_type.lower()}.json"
    ):
        expected_top_level = set(
            read_json(
                f"resources/{catalog_file}_expected_top_level_{integration_type.lower()}.json"
            )["tables"])
    else:
        expected_top_level = set(
            read_json(f"resources/{catalog_file}_expected_top_level.json")
            ["tables"])
        if DestinationType.SNOWFLAKE.value == destination_type.value:
            expected_top_level = {
                table.upper()
                for table in expected_top_level
            }
        elif DestinationType.REDSHIFT.value == destination_type.value:
            expected_top_level = {
                table.lower()
                for table in expected_top_level
            }

    # process substreams
    while substreams:
        children = substreams
        substreams = []
        for substream in children:
            substream.tables_registry = tables_registry
            nested_processors = substream.process()
            add_table_to_registry(tables_registry, substream)
            if nested_processors:
                substreams += nested_processors

    if os.path.exists(
            f"resources/{catalog_file}_expected_nested_{integration_type.lower()}.json"
    ):
        expected_nested = set(
            read_json(
                f"resources/{catalog_file}_expected_nested_{integration_type.lower()}.json"
            )["tables"])
    else:
        expected_nested = set(
            read_json(f"resources/{catalog_file}_expected_nested.json")
            ["tables"])
        if DestinationType.SNOWFLAKE.value == destination_type.value:
            expected_nested = {table.upper() for table in expected_nested}
        elif DestinationType.REDSHIFT.value == destination_type.value:
            expected_nested = {table.lower() for table in expected_nested}

    # TODO(davin): Instead of unwrapping all tables, rewrite this test so tables are compared based on schema.
    all_tables = set()
    for schema in tables_registry:
        for tables in tables_registry[schema]:
            all_tables.add(tables)

    assert (all_tables - expected_top_level) == expected_nested
def test_truncate_identifier(input_str: str, expected: str):
    name_transformer = DestinationNameTransformer(DestinationType.POSTGRES)
    print(f"Truncating from #{len(input_str)} to #{len(expected)}")
    assert name_transformer.truncate_identifier_name(input_str) == expected