Example no. 1
 def unnesting_before_query(self) -> str:
     if self.parent and self.is_nested_array:
         parent_file_name = (
             f"'{self.tables_registry.get_file_name(self.parent.get_schema(False), self.parent.json_path, self.parent.stream_name, '')}'"
         )
         parent_stream_name = f"'{self.parent.normalized_stream_name()}'"
         quoted_field = self.name_transformer.normalize_column_name(
             self.stream_name, in_jinja=True)
         return jinja_call(
             f"unnest_cte({parent_file_name}, {parent_stream_name}, {quoted_field})"
         )
     return ""
Example no. 2
    def unnesting_after_query(self) -> str:
        result = ""
        if self.parent:
            cross_join = ""
            if self.is_nested_array:
                parent_stream_name = f"'{self.parent.normalized_stream_name()}'"
                quoted_field = self.name_transformer.normalize_column_name(self.stream_name, in_jinja=True)
                cross_join = jinja_call(f"cross_join_unnest({parent_stream_name}, {quoted_field})")
            column_name = self.name_transformer.normalize_column_name(self.stream_name)
            result = f"""
{cross_join}
where {column_name} is not null"""
        return result
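For the same hypothetical nested array, unnesting_after_query returns a fragment along these lines (children that are not nested arrays only get the where clause, since cross_join stays empty):

    {{ cross_join_unnest('users', 'addresses') }}
    where addresses is not null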
Example no. 3
    def add_to_outputs(self,
                       sql: str,
                       is_intermediate: bool,
                       column_count: int = 0,
                       suffix: str = "") -> str:
        schema = self.get_schema(is_intermediate)
        # MySQL table names need to be truncated manually because MySQL does not do it automatically
        truncate_name = self.destination_type == DestinationType.MYSQL
        table_name = self.tables_registry.get_table_name(
            schema, self.json_path, self.stream_name, suffix, truncate_name)
        file_name = self.tables_registry.get_file_name(schema, self.json_path,
                                                       self.stream_name,
                                                       suffix, truncate_name)
        file = f"{file_name}.sql"
        if is_intermediate:
            if column_count <= MAXIMUM_COLUMNS_TO_USE_EPHEMERAL:
                output = os.path.join("airbyte_ctes", self.schema, file)
            else:
                # dbt throws a "maximum recursion depth exceeded" exception at runtime
                # when ephemeral materialization is used with a large number of columns, so use views instead
                output = os.path.join("airbyte_views", self.schema, file)
        else:
            output = os.path.join("airbyte_tables", self.schema, file)
        tags = self.get_model_tags(is_intermediate)
        # The alias config sets the model's final table name when it differs from the generated file name.
        if file_name != table_name:
            header = jinja_call(
                f'config(alias="{table_name}", schema="{schema}", tags=[{tags}])'
            )
        else:
            header = jinja_call(f'config(schema="{schema}", tags=[{tags}])')
        self.sql_outputs[output] = f"""
{header}
{sql}
"""
        json_path = self.current_json_path()
        print(f"  Generating {output} from {json_path}")
        return ref_table(file_name)
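As a sketch of the emitted header (alias, schema, and tag values are hypothetical), a non-intermediate model whose file name had to be truncated away from its table name would be written to airbyte_tables/<schema>/<file_name>.sql and start with:

    {{ config(alias="some_rather_long_original_table_name", schema="my_schema", tags=["top-level"]) }}

followed by the generated SQL body.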
Example no. 4
 def cast_property_type(self, property_name: str, column_name: str, jinja_column: str) -> str:
     definition = self.properties[property_name]
     if "type" not in definition:
         print(f"WARN: Unknown type for column {property_name} at {self.current_json_path()}")
         return column_name
     elif is_array(definition["type"]):
         return self.cast_property_type_as_array(property_name, column_name)
     elif is_object(definition["type"]):
         sql_type = self.cast_property_type_as_object(property_name, column_name)
     # Handle simple types from narrowest to widest: boolean < integer < number < string
     elif is_boolean(definition["type"]):
         cast_operation = jinja_call(f"cast_to_boolean({jinja_column})")
         return f"{cast_operation} as {column_name}"
     elif is_integer(definition["type"]):
         sql_type = jinja_call("dbt_utils.type_bigint()")
     elif is_number(definition["type"]):
         sql_type = jinja_call("dbt_utils.type_float()")
     elif is_string(definition["type"]):
         sql_type = jinja_call("dbt_utils.type_string()")
     else:
         print(f"WARN: Unknown type {definition['type']} for column {property_name} at {self.current_json_path()}")
         return column_name
     return f"cast({column_name} as {sql_type}) as {column_name}"
Example no. 5
    def generate_scd_type_2_model(self, from_table: str, column_names: Dict[str, Tuple[str, str]]) -> str:
        template = Template("""
-- SQL model to build a Type 2 Slowly Changing Dimension (SCD) table for each record identified by its primary key
select
  {%- if parent_hash_id %}
    {{ parent_hash_id }},
  {%- endif %}
  {%- for field in fields %}
    {{ field }},
  {%- endfor %}
    {{ cursor_field }} as _airbyte_start_at,
    lag({{ cursor_field }}) over (
        partition by {{ primary_key }}
        order by {{ cursor_field }} desc, _airbyte_emitted_at desc
    ) as _airbyte_end_at,
    lag({{ cursor_field }}) over (
        partition by {{ primary_key }}
        order by {{ cursor_field }} desc, _airbyte_emitted_at desc{{ cdc_updated_at_order }}
    ) is null {{ cdc_active_row }}as _airbyte_active_row,
    _airbyte_emitted_at,
    {{ hash_id }}
from {{ from_table }}
{{ sql_table_comment }}
        """)

        cdc_active_row_pattern = ""
        cdc_updated_order_pattern = ""
        if "_ab_cdc_deleted_at" in column_names.keys():
            cdc_active_row_pattern = "and _ab_cdc_deleted_at is null "
            cdc_updated_order_pattern = ", _ab_cdc_updated_at desc"

        sql = template.render(
            parent_hash_id=self.parent_hash_id(),
            fields=self.list_fields(column_names),
            cursor_field=self.get_cursor_field(column_names),
            primary_key=self.get_primary_key(column_names),
            hash_id=self.hash_id(),
            from_table=jinja_call(from_table),
            sql_table_comment=self.sql_table_comment(include_from_table=True),
            cdc_active_row=cdc_active_row_pattern,
            cdc_updated_at_order=cdc_updated_order_pattern,
        )
        return sql
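With a hypothetical cursor field updated_at and primary key id, and no _ab_cdc_deleted_at column, the rendered active-row expression reduces to:

        lag(updated_at) over (
            partition by id
            order by updated_at desc, _airbyte_emitted_at desc
        ) is null as _airbyte_active_row,

For CDC streams the window additionally orders by _ab_cdc_updated_at desc and the expression gains "and _ab_cdc_deleted_at is null" before the alias.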
Example no. 6
 def __normalize_identifier_name(self, column_name: str, in_jinja: bool = False, truncate: bool = True) -> str:
     result = self.__normalize_naming_conventions(column_name)
     if truncate:
         result = self.__truncate_identifier_name(result)
     if self.needs_quotes(result):
         result = result.replace('"', '""')
         result = result.replace("'", "\\'")
         result = f"adapter.quote('{result}')"
         result = self.__normalize_identifier_case(result, is_quoted=True)
         if not in_jinja:
             result = jinja_call(result)
         return result
     else:
         result = self.__normalize_identifier_case(result, is_quoted=False)
     if in_jinja:
         # to refer to columns while already in jinja context, always quote
         return f"'{result}'"
     return result
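To make the quoting branch concrete (column names hypothetical; needs_quotes and case normalization are destination-specific, and normalize_column_name is assumed to delegate to this helper):

    # illustrative only
    normalize_column_name("order by")                 # -> {{ adapter.quote('order by') }}
    normalize_column_name("order by", in_jinja=True)  # -> adapter.quote('order by')
    normalize_column_name("user_id", in_jinja=True)   # -> 'user_id'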
Example no. 7
    def generate_dedup_record_model(self, from_table: str, column_names: Dict[str, Tuple[str, str]]) -> str:
        template = Template("""
-- SQL model to prepare for deduplicating records based on the hash record column
select
  *,
  row_number() over (
    partition by {{ hash_id }}
    order by _airbyte_emitted_at asc
  ) as _airbyte_row_num
from {{ from_table }}
{{ sql_table_comment }}
        """)
        sql = template.render(
            hash_id=self.hash_id(),
            from_table=jinja_call(from_table),
            sql_table_comment=self.sql_table_comment(include_from_table=True),
        )
        return sql
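Rendered with a hypothetical hash column and source ref (the sql_table_comment line is omitted here), the generated model is essentially:

    select
      *,
      row_number() over (
        partition by _airbyte_users_hashid
        order by _airbyte_emitted_at asc
      ) as _airbyte_row_num
    from {{ ref('users_ab2') }}

Downstream models can then keep only the rows where _airbyte_row_num = 1 to deduplicate on the hash id.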
Example no. 8
    def add_to_outputs(self,
                       sql: str,
                       is_intermediate: bool,
                       suffix: str = "") -> str:
        schema = self.get_schema(is_intermediate)
        table_name = self.generate_new_table_name(is_intermediate, suffix)
        self.add_table_to_local_registry(table_name)
        file = f"{table_name}.sql"
        if is_intermediate:
            output = os.path.join("airbyte_views", self.schema, file)
        else:
            output = os.path.join("airbyte_tables", self.schema, file)
        tags = self.get_model_tags(is_intermediate)
        header = jinja_call(f'config(schema="{schema}", tags=[{tags}])')
        self.sql_outputs[output] = f"""
{header}
{sql}
"""
        json_path = self.current_json_path()
        print(f"  Generating {output} from {json_path}")
        return ref_table(table_name)
Example no. 9
    def generate_column_typing_model(self, from_table: str, column_names: Dict[str, Tuple[str, str]]) -> str:
        template = Template("""
select
  {%- if parent_hash_id %}
    {{ parent_hash_id }},
  {%- endif %}
  {%- for field in fields %}
    {{ field }},
  {%- endfor %}
    _airbyte_emitted_at
from {{ from_table }}
{{ sql_table_comment }}
    """)
        sql = template.render(
            parent_hash_id=self.parent_hash_id(),
            fields=self.cast_property_types(column_names),
            from_table=jinja_call(from_table),
            sql_table_comment=self.sql_table_comment(),
        )
        return sql
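Tying this back to Example no. 4, the rendered model for a hypothetical top-level users stream with one integer and one boolean property would look roughly like (parent_hash_id is only emitted for nested streams):

    select
        cast(user_id as {{ dbt_utils.type_bigint() }}) as user_id,
        {{ cast_to_boolean('is_active') }} as is_active,
        _airbyte_emitted_at
    from {{ ref('users_ab1') }}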
Example no. 10
    def generate_final_model(self, from_table: str,
                             column_names: Dict[str, Tuple[str, str]]) -> str:
        template = Template("""
select
  {%- if parent_hash_id %}
    {{ parent_hash_id }},
  {%- endif %}
  {%- for field in fields %}
    {{ field }},
  {%- endfor %}
    _airbyte_emitted_at,
    {{ hash_id }}
from {{ from_table }}
{{ sql_table_comment }}
    """)
        sql = template.render(
            parent_hash_id=self.parent_hash_id(),
            fields=self.list_fields(column_names),
            hash_id=self.hash_id(),
            from_table=jinja_call(from_table),
            sql_table_comment=self.sql_table_comment(include_from_table=True),
        )
        return sql
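The final model has the same shape but lists the already-typed columns and appends the stream's hash id (assuming hash_id() resolves to a name like _airbyte_users_hashid):

    select
        user_id,
        is_active,
        _airbyte_emitted_at,
        _airbyte_users_hashid
    from {{ ref('users_ab3') }}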
Example no. 11
 def __normalize_identifier_name(
     self, column_name: str, in_jinja: bool = False, truncate: bool = True, conflict: bool = False, conflict_level: int = 0
 ) -> str:
     result = self.__normalize_naming_conventions(column_name, is_column=True)
     if truncate:
         result = self.truncate_identifier_name(input_name=result, conflict=conflict, conflict_level=conflict_level)
     if self.needs_quotes(result):
         if self.destination_type.value != DestinationType.MYSQL.value:
             result = result.replace('"', '""')
         else:
             result = result.replace("`", "_")
         result = result.replace("'", "\\'")
         result = self.__normalize_identifier_case(result, is_quoted=True)
         result = self.apply_quote(result)
         if not in_jinja:
             result = jinja_call(result)
         return result
     else:
         result = self.__normalize_identifier_case(result, is_quoted=False)
     if in_jinja:
         # to refer to columns while already in jinja context, always quote
         return f"'{result}'"
     return result
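Relative to Example no. 6, the main change is in the quoting branch: on MySQL, backticks in the identifier are replaced with underscores (instead of doubling embedded double quotes), and the quoting itself is delegated to apply_quote. Assuming apply_quote emits an adapter.quote(...) expression, as the older variant does explicitly, hypothetical inputs would behave roughly as:

    # illustrative only
    normalize_column_name("order by")       # -> {{ adapter.quote('order by') }}
    normalize_column_name("price`usd")      # MySQL: backtick replaced first, -> {{ adapter.quote('price_usd') }}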
Example no. 12
 def cast_property_type_as_object(self, property_name: str,
                                  column_name: str) -> str:
     if self.destination_type.value == DestinationType.BIGQUERY.value:
         # TODO build a struct/record type from properties JSON schema
         pass
     return jinja_call("type_json()")