def unnesting_before_query(self) -> str:
    """Return the jinja `unnest_cte` macro call to emit before querying this stream.

    Produces an empty string when this stream is not the child of a parent
    nested-array stream (i.e. no unnesting CTE is required).
    """
    if not (self.parent and self.is_nested_array):
        return ""
    parent_file = self.tables_registry.get_file_name(
        self.parent.get_schema(False), self.parent.json_path, self.parent.stream_name, ""
    )
    parent_file_name = f"'{parent_file}'"
    parent_stream_name = f"'{self.parent.normalized_stream_name()}'"
    # in_jinja=True: the field reference is interpolated inside a jinja macro call
    quoted_field = self.name_transformer.normalize_column_name(self.stream_name, in_jinja=True)
    return jinja_call(f"unnest_cte({parent_file_name}, {parent_stream_name}, {quoted_field})")
def unnesting_after_query(self) -> str:
    """Return the SQL clauses to append after querying a child stream.

    For streams with a parent this adds a null filter on the unnested column,
    plus a `cross_join_unnest` jinja call when the child is a nested array.
    Returns an empty string for streams without a parent.
    """
    if not self.parent:
        return ""
    cross_join = ""
    if self.is_nested_array:
        stream_literal = f"'{self.parent.normalized_stream_name()}'"
        field_literal = self.name_transformer.normalize_column_name(self.stream_name, in_jinja=True)
        cross_join = jinja_call(f"cross_join_unnest({stream_literal}, {field_literal})")
    column_name = self.name_transformer.normalize_column_name(self.stream_name)
    return f"""
{cross_join}
where {column_name} is not null"""
def add_to_outputs(self, sql: str, is_intermediate: bool, column_count: int = 0, suffix: str = "") -> str:
    """Register a rendered SQL model under the appropriate output directory.

    Resolves the model's table/file name through the tables registry, stores
    the model body (prefixed with its dbt config header) in `self.sql_outputs`,
    and returns a jinja `ref()` to the generated file.
    """
    schema = self.get_schema(is_intermediate)
    # MySQL table names need to be manually truncated, because it does not do it automatically
    truncate_name = self.destination_type == DestinationType.MYSQL
    table_name = self.tables_registry.get_table_name(schema, self.json_path, self.stream_name, suffix, truncate_name)
    file_name = self.tables_registry.get_file_name(schema, self.json_path, self.stream_name, suffix, truncate_name)
    if not is_intermediate:
        sub_dir = "airbyte_tables"
    elif column_count <= MAXIMUM_COLUMNS_TO_USE_EPHEMERAL:
        sub_dir = "airbyte_ctes"
    else:
        # dbt throws "maximum recursion depth exceeded" exception at runtime
        # if ephemeral is used with large number of columns, use views instead
        sub_dir = "airbyte_views"
    output = os.path.join(sub_dir, self.schema, f"{file_name}.sql")
    tags = self.get_model_tags(is_intermediate)
    # The alias() macro configs a model's final table name; it is only needed when
    # the file name diverges from the target table name.
    alias_part = f'alias="{table_name}", ' if file_name != table_name else ""
    header = jinja_call(f'config({alias_part}schema="{schema}", tags=[{tags}])')
    self.sql_outputs[
        output
    ] = f"""
{header}
{sql}
"""
    print(f"  Generating {output} from {self.current_json_path()}")
    return ref_table(file_name)
def cast_property_type(self, property_name: str, column_name: str, jinja_column: str) -> str:
    """Build the select-list expression casting a JSON property to its SQL type.

    Falls back to the bare column name (with a warning) when the JSON schema
    declares no type, or an unrecognized one.
    """
    definition = self.properties[property_name]
    if "type" not in definition:
        print(f"WARN: Unknown type for column {property_name} at {self.current_json_path()}")
        return column_name
    property_type = definition["type"]
    # Arrays and booleans produce their own complete expressions below; the
    # remaining simple types are tried from narrower to wider scope:
    # boolean < integer < number < string
    if is_array(property_type):
        return self.cast_property_type_as_array(property_name, column_name)
    if is_object(property_type):
        sql_type = self.cast_property_type_as_object(property_name, column_name)
    elif is_boolean(property_type):
        cast_operation = jinja_call(f"cast_to_boolean({jinja_column})")
        return f"{cast_operation} as {column_name}"
    elif is_integer(property_type):
        sql_type = jinja_call("dbt_utils.type_bigint()")
    elif is_number(property_type):
        sql_type = jinja_call("dbt_utils.type_float()")
    elif is_string(property_type):
        sql_type = jinja_call("dbt_utils.type_string()")
    else:
        print(f"WARN: Unknown type {property_type} for column {property_name} at {self.current_json_path()}")
        return column_name
    return f"cast({column_name} as {sql_type}) as {column_name}"
def generate_scd_type_2_model(self, from_table: str, column_names: Dict[str, Tuple[str, str]]) -> str:
    """Render the SQL model building a Type 2 SCD table for this stream.

    Window functions over the primary key and cursor field derive each row's
    validity interval (_airbyte_start_at/_airbyte_end_at) and active flag.
    """
    scd_template = Template(
        """
-- SQL model to build a Type 2 Slowly Changing Dimension (SCD) table for each record identified by their primary key
select
{%- if parent_hash_id %}
  {{ parent_hash_id }},
{%- endif %}
{%- for field in fields %}
  {{ field }},
{%- endfor %}
  {{ cursor_field }} as _airbyte_start_at,
  lag({{ cursor_field }}) over (
    partition by {{ primary_key }}
    order by {{ cursor_field }} desc, _airbyte_emitted_at desc
  ) as _airbyte_end_at,
  lag({{ cursor_field }}) over (
    partition by {{ primary_key }}
    order by {{ cursor_field }} desc, _airbyte_emitted_at desc{{ cdc_updated_at_order }}
  ) is null {{ cdc_active_row }}as _airbyte_active_row,
  _airbyte_emitted_at,
  {{ hash_id }}
from {{ from_table }}
{{ sql_table_comment }}
"""
    )
    # CDC streams carry a soft-delete marker: an active row must also have no
    # deletion timestamp, and the window ordering breaks ties on the CDC update time.
    has_cdc_deleted_at = "_ab_cdc_deleted_at" in column_names
    return scd_template.render(
        parent_hash_id=self.parent_hash_id(),
        fields=self.list_fields(column_names),
        cursor_field=self.get_cursor_field(column_names),
        primary_key=self.get_primary_key(column_names),
        hash_id=self.hash_id(),
        from_table=jinja_call(from_table),
        sql_table_comment=self.sql_table_comment(include_from_table=True),
        cdc_active_row="and _ab_cdc_deleted_at is null " if has_cdc_deleted_at else "",
        cdc_updated_at_order=", _ab_cdc_updated_at desc" if has_cdc_deleted_at else "",
    )
def __normalize_identifier_name(self, column_name: str, in_jinja: bool = False, truncate: bool = True) -> str:
    """Normalize an identifier name, quoting it through the dbt adapter when needed.

    Identifiers that need quoting have embedded quotes escaped and are wrapped
    in an `adapter.quote(...)` jinja expression; plain identifiers are only
    case-normalized. `in_jinja` controls whether the result is returned as a
    raw jinja expression / quoted literal instead of a rendered `{{ ... }}` call.
    """
    result = self.__normalize_naming_conventions(column_name)
    if truncate:
        result = self.__truncate_identifier_name(result)
    if not self.needs_quotes(result):
        result = self.__normalize_identifier_case(result, is_quoted=False)
        # to refer to columns while already in jinja context, always quote
        return f"'{result}'" if in_jinja else result
    # escape embedded quote characters, then let the adapter apply destination quoting
    escaped = result.replace('"', '""').replace("'", "\\'")
    quoted = self.__normalize_identifier_case(f"adapter.quote('{escaped}')", is_quoted=True)
    return quoted if in_jinja else jinja_call(quoted)
def generate_dedup_record_model(self, from_table: str, column_names: Dict[str, Tuple[str, str]]) -> str:
    """Render the SQL model numbering duplicate records that share a hash id.

    Rows are partitioned by the record hash and ordered by emission time so a
    later model can keep only `_airbyte_row_num = 1`.
    """
    dedup_template = Template(
        """
-- SQL model to prepare for deduplicating records based on the hash record column
select
  *,
  row_number() over (
    partition by {{ hash_id }}
    order by _airbyte_emitted_at asc
  ) as _airbyte_row_num
from {{ from_table }}
{{ sql_table_comment }}
"""
    )
    return dedup_template.render(
        hash_id=self.hash_id(),
        from_table=jinja_call(from_table),
        sql_table_comment=self.sql_table_comment(include_from_table=True),
    )
def add_to_outputs(self, sql: str, is_intermediate: bool, suffix: str = "") -> str:
    """Register a rendered SQL model file and return a jinja `ref()` to it.

    Generates a fresh table name, records it in the local registry, stores the
    model body (prefixed with its dbt config header) in `self.sql_outputs`
    under either the views or tables output directory.
    """
    schema = self.get_schema(is_intermediate)
    table_name = self.generate_new_table_name(is_intermediate, suffix)
    self.add_table_to_local_registry(table_name)
    sub_dir = "airbyte_views" if is_intermediate else "airbyte_tables"
    output = os.path.join(sub_dir, self.schema, f"{table_name}.sql")
    tags = self.get_model_tags(is_intermediate)
    header = jinja_call(f'config(schema="{schema}", tags=[{tags}])')
    self.sql_outputs[
        output
    ] = f"""
{header}
{sql}
"""
    print(f"  Generating {output} from {self.current_json_path()}")
    return ref_table(table_name)
def generate_column_typing_model(self, from_table: str, column_names: Dict[str, Tuple[str, str]]) -> str:
    """Render the SQL model casting each column to its destination SQL type.

    The `fields` come pre-cast from `cast_property_types`; the model simply
    selects them (plus the emission timestamp) from the upstream table.
    """
    typing_template = Template(
        """
select
{%- if parent_hash_id %}
  {{ parent_hash_id }},
{%- endif %}
{%- for field in fields %}
  {{ field }},
{%- endfor %}
  _airbyte_emitted_at
from {{ from_table }}
{{ sql_table_comment }}
"""
    )
    return typing_template.render(
        parent_hash_id=self.parent_hash_id(),
        fields=self.cast_property_types(column_names),
        from_table=jinja_call(from_table),
        sql_table_comment=self.sql_table_comment(),
    )
def generate_final_model(self, from_table: str, column_names: Dict[str, Tuple[str, str]]) -> str:
    """Render the final SQL model selecting all fields plus the record hash id."""
    final_template = Template(
        """
select
{%- if parent_hash_id %}
  {{ parent_hash_id }},
{%- endif %}
{%- for field in fields %}
  {{ field }},
{%- endfor %}
  _airbyte_emitted_at,
  {{ hash_id }}
from {{ from_table }}
{{ sql_table_comment }}
"""
    )
    return final_template.render(
        parent_hash_id=self.parent_hash_id(),
        fields=self.list_fields(column_names),
        hash_id=self.hash_id(),
        from_table=jinja_call(from_table),
        sql_table_comment=self.sql_table_comment(include_from_table=True),
    )
def __normalize_identifier_name(
    self, column_name: str, in_jinja: bool = False, truncate: bool = True, conflict: bool = False, conflict_level: int = 0
) -> str:
    """Normalize a column identifier, applying destination-specific quoting.

    Conflicting names can be disambiguated via `conflict`/`conflict_level`
    during truncation. Identifiers that need quoting are escaped per
    destination, case-normalized, then wrapped by `apply_quote`; `in_jinja`
    controls whether the result is a raw jinja expression / quoted literal
    instead of a rendered `{{ ... }}` call.
    """
    result = self.__normalize_naming_conventions(column_name, is_column=True)
    if truncate:
        result = self.truncate_identifier_name(input_name=result, conflict=conflict, conflict_level=conflict_level)
    if not self.needs_quotes(result):
        result = self.__normalize_identifier_case(result, is_quoted=False)
        # to refer to columns while already in jinja context, always quote
        return f"'{result}'" if in_jinja else result
    if self.destination_type.value == DestinationType.MYSQL.value:
        # MySQL quotes identifiers with backticks: replace embedded ones instead of doubling double-quotes
        escaped = result.replace("`", "_")
    else:
        escaped = result.replace('"', '""')
    escaped = escaped.replace("'", "\\'")
    quoted = self.apply_quote(self.__normalize_identifier_case(escaped, is_quoted=True))
    return quoted if in_jinja else jinja_call(quoted)
def cast_property_type_as_object(self, property_name: str, column_name: str) -> str:
    """Return the jinja macro emitting the destination's JSON type for an object property."""
    if self.destination_type.value == DestinationType.BIGQUERY.value:
        # TODO build a struct/record type from properties JSON schema
        # (currently BigQuery falls through to the generic JSON type like every other destination)
        pass
    return jinja_call("type_json()")