Beispiel #1
0
    def _construct_datalineage_urn(self, sql_table_name: str,
                                   looker_view: LookerView) -> str:
        """Return the URN of the upstream entity that ``sql_table_name`` refers to.

        Args:
            sql_table_name: Table reference taken from the view. Either a
                database table name or a derived-table alias of the form
                ``<view_name>.SQL_TABLE_NAME`` (matched case-insensitively).
            looker_view: The view that holds the reference; its connection and
                its project/model identifiers are used to build the URN.

        Returns:
            For a derived-table alias, the URN of the referenced Looker view
            (same project and model as ``looker_view``). Otherwise a dataset
            URN built from the connection's platform, platform instance and
            environment.
        """
        logger.debug(f"sql_table_name={sql_table_name}")
        connection_def: LookerConnectionDefinition = looker_view.connection

        # Cascading derived tables are referenced through an alias that looks
        # like table_name.SQL_TABLE_NAME.
        # See https://docs.looker.com/data-modeling/learning-lookml/derived-tables#syntax_for_referencing_a_derived_table
        alias_match = re.fullmatch(r"\w+\.SQL_TABLE_NAME",
                                   sql_table_name,
                                   flags=re.I)
        if alias_match is not None:
            # The part before the dot is the name of the upstream view, which
            # lives in the same project and model as the current view.
            upstream_view_name = sql_table_name.lower().partition(".")[0]
            current_id = looker_view.id
            upstream_view_id = LookerViewId(
                project_name=current_id.project_name,
                model_name=current_id.model_name,
                view_name=upstream_view_name,
            )
            return upstream_view_id.get_urn(self.source_config)

        # Plain table reference: bring the name into canonical form first
        # (add in db, schema names).
        qualified_name = self._generate_fully_qualified_name(
            sql_table_name, connection_def)

        platform = connection_def.platform
        dataset_name = qualified_name.lower()
        platform_instance = connection_def.platform_instance
        # The connection-level environment wins; otherwise use the source-wide one.
        env = connection_def.platform_env or self.source_config.env
        return builder.make_dataset_urn_with_platform_instance(
            platform=platform,
            name=dataset_name,
            platform_instance=platform_instance,
            env=env,
        )
Beispiel #2
0
    def from_looker_dict(
        cls,
        project_name: str,
        model_name: str,
        looker_view: dict,
        connection: LookerConnectionDefinition,
        looker_viewfile: LookerViewFile,
        looker_viewfile_loader: LookerViewFileLoader,
        reporter: LookMLSourceReport,
        parse_table_names_from_sql: bool = False,
        sql_parser_path: str = "datahub.utilities.sql_parser.DefaultSQLParser",
    ) -> Optional["LookerView"]:
        """Build a ``LookerView`` from the parsed LookML dictionary of one view.

        Two kinds of view are handled:

        * Derived tables (``derived_table`` key present): fields and upstream
          table names come from ``_extract_metadata_from_sql_query``. When the
          derived table declares ``sql`` or ``explore_source``, the view logic,
          its language and a materialization flag are attached as
          ``ViewProperties``.
        * Plain views: the only upstream is ``sql_table_name`` (resolved
          through extended views), or the view name when it is not set.

        Args:
            project_name: Project name stored in the view id.
            model_name: Model name stored in the view id.
            looker_view: Parsed view dictionary; must contain ``"name"``.
            connection: Connection definition stored on the resulting view.
            looker_viewfile: File the view was read from; supplies the
                absolute path and raw content.
            looker_viewfile_loader: Loader passed on to
                ``get_including_extends``.
            reporter: Report object passed on to the helpers.
            parse_table_names_from_sql: Passed on to
                ``_extract_metadata_from_sql_query``.
            sql_parser_path: Passed on to ``_extract_metadata_from_sql_query``.

        Returns:
            The constructed ``LookerView``.

        Raises:
            KeyError: If ``looker_view`` has no ``"name"`` entry.
        """
        view_name = looker_view["name"]
        logger.debug(f"Handling view {view_name} in model {model_name}")
        # The sql_table_name might be defined in another view and this view is extending that view,
        # so we resolve this field while taking that into account.
        sql_table_name: Optional[str] = LookerView.get_including_extends(
            view_name=view_name,
            looker_view=looker_view,
            connection=connection,
            looker_viewfile=looker_viewfile,
            looker_viewfile_loader=looker_viewfile_loader,
            field="sql_table_name",
            reporter=reporter,
        )

        # Some sql_table_name fields contain quotes like: optimizely."group", just remove the quotes
        if sql_table_name is not None:
            sql_table_name = sql_table_name.replace('"', "").replace("`", "")
        derived_table = looker_view.get("derived_table", None)

        dimensions = cls._get_fields(looker_view.get("dimensions", []),
                                     ViewFieldType.DIMENSION)
        dimension_groups = cls._get_fields(
            looker_view.get("dimension_groups", []),
            ViewFieldType.DIMENSION_GROUP)
        measures = cls._get_fields(looker_view.get("measures", []),
                                   ViewFieldType.MEASURE)
        fields: List[ViewField] = dimensions + dimension_groups + measures

        # Parse SQL from derived tables to extract dependencies
        if derived_table is not None:
            fields, sql_table_names = cls._extract_metadata_from_sql_query(
                reporter,
                parse_table_names_from_sql,
                sql_parser_path,
                view_name,
                sql_table_name,
                derived_table,
                fields,
            )

            # Also store the view logic and materialization. A derived table
            # is not guaranteed to carry "sql" or "explore_source", so start
            # from "unknown" instead of leaving the names unbound.
            view_logic: Optional[str] = None
            view_lang: Optional[str] = None
            if "sql" in derived_table:
                view_logic = derived_table["sql"]
                view_lang = "sql"
            # When both keys are present, explore_source takes precedence.
            if "explore_source" in derived_table:
                view_logic = str(derived_table["explore_source"])
                view_lang = "lookml"

            # Any persistence setting marks the derived table as materialized ...
            materialized = any(
                key in derived_table for key in (
                    "datagroup_trigger", "sql_trigger_value", "persist_for"))
            # ... unless materialized_view is given, which decides on its own.
            if "materialized_view" in derived_table:
                materialized = derived_table["materialized_view"] == "yes"

            # Only attach view details when there is logic to describe;
            # otherwise fall back to the constructor's default, exactly like
            # the non-derived path below.
            optional_kwargs = {}
            if view_logic is not None and view_lang is not None:
                optional_kwargs["view_details"] = ViewProperties(
                    materialized=materialized,
                    viewLogic=view_logic,
                    viewLanguage=view_lang)
            else:
                logger.debug(
                    f"Derived table of view {view_name} has neither sql nor "
                    "explore_source; view details are omitted")

            return LookerView(
                id=LookerViewId(
                    project_name=project_name,
                    model_name=model_name,
                    view_name=view_name,
                ),
                absolute_file_path=looker_viewfile.absolute_file_path,
                connection=connection,
                sql_table_names=sql_table_names,
                fields=fields,
                raw_file_content=looker_viewfile.raw_file_content,
                **optional_kwargs,
            )

        # If not a derived table, then this view essentially wraps an existing
        # object in the database.
        if sql_table_name is not None:
            # If sql_table_name is set, there is a single dependency in the view, on the sql_table_name.
            sql_table_names = [sql_table_name]
        else:
            # Otherwise, default to the view name as per the docs:
            # https://docs.looker.com/reference/view-params/sql_table_name-for-view
            sql_table_names = [view_name]

        output_looker_view = LookerView(
            id=LookerViewId(project_name=project_name,
                            model_name=model_name,
                            view_name=view_name),
            absolute_file_path=looker_viewfile.absolute_file_path,
            sql_table_names=sql_table_names,
            connection=connection,
            fields=fields,
            raw_file_content=looker_viewfile.raw_file_content,
        )
        return output_looker_view
Beispiel #3
0
    def from_looker_dict(
        cls,
        project_name: str,
        model_name: str,
        looker_view: dict,
        connection: LookerConnectionDefinition,
        looker_viewfile: LookerViewFile,
        looker_viewfile_loader: LookerViewFileLoader,
        reporter: LookMLSourceReport,
        parse_table_names_from_sql: bool = False,
        sql_parser_path: str = "datahub.utilities.sql_parser.DefaultSQLParser",
    ) -> Optional["LookerView"]:
        """Create a ``LookerView`` from the parsed LookML dictionary of a view.

        The upstream table list of the result is chosen as follows:

        * derived table: the tables returned by ``_get_sql_table_names`` for
          the derived table's ``sql`` when ``parse_table_names_from_sql`` is
          set and ``sql`` is present, otherwise an empty list;
        * ``sql_table_name`` set (directly or via an extended view): that
          single name, with double quotes and backticks removed;
        * neither: the view name itself.

        Args:
            project_name: Project name stored in the view id.
            model_name: Model name stored in the view id.
            looker_view: Parsed view dictionary; must contain ``"name"``.
            connection: Connection definition stored on the resulting view.
            looker_viewfile: File the view was read from; supplies the
                absolute path and raw content.
            looker_viewfile_loader: Loader passed on to
                ``get_including_extends``.
            reporter: Report object passed on to ``get_including_extends``.
            parse_table_names_from_sql: Whether to parse the derived table's
                SQL for upstream table names.
            sql_parser_path: Parser path handed to ``_get_sql_table_names``.

        Returns:
            The constructed ``LookerView``.
        """
        view_name = looker_view["name"]
        logger.debug(f"Handling view {view_name} in model {model_name}")

        # sql_table_name may be inherited from a view this one extends, so it
        # is looked up with extends taken into account.
        declared_table: Optional[str] = LookerView.get_including_extends(
            view_name=view_name,
            looker_view=looker_view,
            connection=connection,
            looker_viewfile=looker_viewfile,
            looker_viewfile_loader=looker_viewfile_loader,
            field="sql_table_name",
            reporter=reporter,
        )
        if declared_table is not None:
            # Names such as optimizely."group" carry quotes; drop them.
            declared_table = declared_table.replace('"', "").replace("`", "")

        derived_table = looker_view.get("derived_table", None)

        # Collect every field kind, in the order dimensions, dimension
        # groups, measures.
        fields: List[ViewField] = (
            cls._get_fields(looker_view.get("dimensions", []),
                            ViewFieldType.DIMENSION) +
            cls._get_fields(looker_view.get("dimension_groups", []),
                            ViewFieldType.DIMENSION_GROUP) +
            cls._get_fields(looker_view.get("measures", []),
                            ViewFieldType.MEASURE))

        if derived_table is not None:
            # Derived table: dependencies come from its SQL, when parsing is
            # requested and SQL is available.
            upstream_tables = []
            if parse_table_names_from_sql and "sql" in derived_table:
                logger.debug(
                    f"Parsing sql from derived table section of view: {view_name}"
                )
                upstream_tables = cls._get_sql_table_names(
                    derived_table["sql"], sql_parser_path)
        elif declared_table is not None:
            # The view wraps an existing database object: one dependency.
            upstream_tables = [declared_table]
        else:
            # No table declared: the view name is the default, as per
            # https://docs.looker.com/reference/view-params/sql_table_name-for-view
            upstream_tables = [view_name]

        return LookerView(
            id=LookerViewId(
                project_name=project_name,
                model_name=model_name,
                view_name=view_name,
            ),
            absolute_file_path=looker_viewfile.absolute_file_path,
            connection=connection,
            sql_table_names=upstream_tables,
            fields=fields,
            raw_file_content=looker_viewfile.raw_file_content,
        )