Ejemplo n.º 1
0
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        """Parse the dbt manifest/catalog and yield one workunit per node."""
        env: str = "PROD"
        platform = self.platform
        nodes = loadManifestAndCatalog(
            self.config.manifest_path, self.config.catalog_path, platform, env
        )

        for node in nodes:
            snapshot = DatasetSnapshot()
            snapshot.urn = node.datahub_urn

            # Dataset-level properties: description plus dbt custom metadata.
            props = DatasetPropertiesClass()
            props.description = node.dbt_name
            props.customProperties = get_custom_properties(node)
            snapshot.aspects.append(props)

            # Upstream lineage is optional — only attach it when present.
            lineage = get_upstream_lineage(node.upstream_urns)
            if lineage is not None:
                snapshot.aspects.append(lineage)

            snapshot.aspects.append(get_schema_metadata(self.report, node, platform))

            event = MetadataChangeEvent()
            event.proposedSnapshot = snapshot

            workunit = MetadataWorkUnit(id=snapshot.urn, mce=event)
            self.report.report_workunit(workunit)

            yield workunit
Ejemplo n.º 2
0
    def get_workunits(self):
        """Reflect every schema/table through SQLAlchemy and yield workunits."""
        env: str = "PROD"
        sql_config = self.config
        platform = self.platform
        url = sql_config.get_sql_alchemy_url()
        logger.debug(f"sql_alchemy_url={url}")
        engine = create_engine(url, **sql_config.options)
        inspector = reflection.Inspector.from_engine(engine)
        database = sql_config.database
        for schema in inspector.get_schema_names():
            for table in inspector.get_table_names(schema):
                # Qualify with the database name only when one is configured.
                if database == "":
                    dataset_name = f"{schema}.{table}"
                else:
                    dataset_name = f"{database}.{schema}.{table}"
                self.report.report_table_scanned(dataset_name)

                # Guard clause: skip tables rejected by the pattern filter.
                if not sql_config.table_pattern.allowed(dataset_name):
                    self.report.report_dropped(dataset_name)
                    continue

                columns = inspector.get_columns(table, schema)

                dataset_snapshot = DatasetSnapshot()
                dataset_snapshot.urn = f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{env})"
                dataset_snapshot.aspects.append(
                    get_schema_metadata(self.report, dataset_name, platform, columns)
                )

                mce = MetadataChangeEvent()
                mce.proposedSnapshot = dataset_snapshot

                wu = SqlWorkUnit(id=dataset_name, mce=mce)
                self.report.report_workunit(wu)
                yield wu
Ejemplo n.º 3
0
    def get_workunits(self) -> Iterable[SqlWorkUnit]:
        """Scan all allowed schemas/tables and yield a SqlWorkUnit per table."""
        env: str = "PROD"
        sql_config = self.config
        platform = self.platform
        url = sql_config.get_sql_alchemy_url()
        logger.debug(f"sql_alchemy_url={url}")
        engine = create_engine(url, **sql_config.options)
        inspector = reflection.Inspector.from_engine(engine)
        for schema in inspector.get_schema_names():
            # Skip entire schemas the pattern filter rejects.
            if not sql_config.schema_pattern.allowed(schema):
                self.report.report_dropped(schema)
                continue

            for table in inspector.get_table_names(schema):
                schema, table = sql_config.standardize_schema_table_names(schema, table)
                dataset_name = sql_config.get_identifier(schema, table)
                self.report.report_table_scanned(dataset_name)

                if not sql_config.table_pattern.allowed(dataset_name):
                    self.report.report_dropped(dataset_name)
                    continue

                columns = inspector.get_columns(table, schema)
                # Not every SQLAlchemy dialect implements table comments.
                try:
                    description: Optional[str] = inspector.get_table_comment(
                        table, schema
                    )["text"]
                except NotImplementedError:
                    description = None

                # TODO: capture inspector.get_pk_constraint
                # TODO: capture inspector.get_sorted_table_and_fkc_names

                dataset_snapshot = DatasetSnapshot()
                dataset_snapshot.urn = f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{env})"
                # Only attach a properties aspect when a description exists.
                if description is not None:
                    dataset_snapshot.aspects.append(
                        DatasetPropertiesClass(
                            description=description,
                            tags=[],
                            customProperties={},
                            # uri=dataset_name,
                        )
                    )
                dataset_snapshot.aspects.append(
                    get_schema_metadata(self.report, dataset_name, platform, columns)
                )

                mce = MetadataChangeEvent()
                mce.proposedSnapshot = dataset_snapshot

                wu = SqlWorkUnit(id=dataset_name, mce=mce)
                self.report.report_workunit(wu)
                yield wu
Ejemplo n.º 4
0
def create_metadata_work_unit(timestamp):
    """Build the sample Glue workunit for ``datalake_grilled.Barbeque``.

    Parameters
    ----------
        timestamp:
            audit time (epoch millis) stamped on ownership and schema aspects
    """
    ownership = OwnershipClass(
        owners=[
            OwnerClass(owner="urn:li:corpuser:Susan",
                       type=OwnershipTypeClass.DATAOWNER)
        ],
        lastModified=AuditStampClass(time=timestamp,
                                     actor="urn:li:corpuser:datahub"),
    )

    properties = DatasetPropertiesClass(
        description="Grilled Food",
        customProperties={},
        uri=None,
        tags=[],
    )

    schema_metadata = SchemaMetadata(
        schemaName="datalake_grilled.Barbeque",
        version=0,
        fields=[
            SchemaField(
                fieldPath="Size",
                nativeDataType="int",
                type=SchemaFieldDataType(type=NumberTypeClass()),
                description="Maximum attendees permitted",
                nullable=True,
                recursive=False,
            )
        ],
        platform="urn:li:dataPlatform:glue",
        created=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        lastModified=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
    )

    # Aspect order mirrors the original: ownership, properties, status, schema.
    dataset_snapshot = DatasetSnapshot(
        urn=
        "urn:li:dataset:(urn:li:dataPlatform:glue,datalake_grilled.Barbeque,PROD)",
        aspects=[ownership, properties, Status(removed=False), schema_metadata],
    )

    mce = MetadataChangeEvent()
    mce.proposedSnapshot = dataset_snapshot
    return MetadataWorkUnit(id="glue-datalake_grilled.Barbeque", mce=mce)
Ejemplo n.º 5
0
    def _extract_record(self, topic: str) -> MetadataChangeEvent:
        """Build an MCE for one Kafka topic, attaching its registry schema if found."""
        logger.debug(f"topic = {topic}")
        platform = "kafka"
        dataset_name = topic
        env = "PROD"  # TODO: configure!
        actor = "urn:li:corpuser:etl"
        sys_time = int(time.time() * 1000)

        dataset_snapshot = DatasetSnapshot(
            urn=
            f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{env})",
            aspects=[Status(removed=False)],
        )
        metadata_record = MetadataChangeEvent()
        metadata_record.proposedSnapshot = dataset_snapshot

        # Fetch the value schema from the registry; failure is reported, not fatal.
        found_schema = True
        try:
            registered_schema = self.schema_registry_client.get_latest_version(
                topic + "-value")
            schema = registered_schema.schema
        except Exception as e:
            self.report.report_warning(topic, f"failed to get schema: {e}")
            found_schema = False

        # Only AVRO payloads can currently be parsed into MCE fields.
        fields: List[SchemaField] = []
        if found_schema:
            if schema.schema_type == "AVRO":
                fields = schema_util.avro_schema_to_mce_fields(schema.schema_str)
            else:
                self.report.report_warning(
                    topic,
                    f"unable to parse kafka schema type {schema.schema_type}")

            dataset_snapshot.aspects.append(
                SchemaMetadata(
                    schemaName=topic,
                    version=0,
                    hash=str(schema._hash),
                    platform=f"urn:li:dataPlatform:{platform}",
                    platformSchema=KafkaSchema(documentSchema=schema.schema_str),
                    fields=fields,
                    created=AuditStamp(time=sys_time, actor=actor),
                    lastModified=AuditStamp(time=sys_time, actor=actor),
                ))

        return metadata_record
Ejemplo n.º 6
0
    def get_feature_table_wu(self, ingest_table):
        """
        Generate an MLFeatureTable workunit for a Feast feature table.

        Parameters
        ----------
            ingest_table:
                ingested Feast table
        """
        table_name = ingest_table["name"]

        feature_urns = [
            builder.make_ml_feature_urn(table_name, feature["name"])
            for feature in ingest_table["features"]
        ]
        # a feature table can have multiple primary keys, which then act as a composite key
        primary_key_urns = [
            builder.make_ml_primary_key_urn(table_name, entity["name"])
            for entity in ingest_table["entities"]
        ]

        featuretable_snapshot = MLFeatureTableSnapshot(
            urn=builder.make_ml_feature_table_urn("feast", table_name),
            aspects=[
                MLFeatureTablePropertiesClass(
                    mlFeatures=feature_urns,
                    mlPrimaryKeys=primary_key_urns,
                )
            ],
        )

        # make the MCE and workunit
        mce = MetadataChangeEvent(proposedSnapshot=featuretable_snapshot)
        return MetadataWorkUnit(id=table_name, mce=mce)
Ejemplo n.º 7
0
    def _make_dashboard_and_chart_mces(
            self,
            looker_dashboard: LookerDashboard) -> List[MetadataChangeEvent]:
        """Return one MCE per dashboard element plus the dashboard's own MCE."""
        chart_mces = []
        for element in looker_dashboard.dashboard_elements:
            chart_mces.append(self._make_chart_mce(element))

        dashboard_snapshot = DashboardSnapshot(
            urn=builder.make_dashboard_urn(
                self.source_config.platform_name,
                looker_dashboard.get_urn_dashboard_id()),
            aspects=[],
        )

        # The dashboard references its charts via the URNs minted above.
        dashboard_snapshot.aspects.append(
            DashboardInfoClass(
                description=looker_dashboard.description or "",
                title=looker_dashboard.title,
                charts=[mce.proposedSnapshot.urn for mce in chart_mces],
                lastModified=ChangeAuditStamps(),
                dashboardUrl=looker_dashboard.url(self.source_config.base_url),
            ))
        dashboard_snapshot.aspects.append(
            Status(removed=looker_dashboard.is_deleted))

        return chart_mces + [
            MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
        ]
Ejemplo n.º 8
0
    def _make_chart_mce(
            self,
            dashboard_element: LookerDashboardElement) -> MetadataChangeEvent:
        """Build a chart MCE for a single Looker dashboard element."""
        chart_snapshot = ChartSnapshot(
            urn=builder.make_chart_urn(
                self.source_config.platform_name,
                dashboard_element.get_urn_element_id()),
            aspects=[],
        )

        chart_snapshot.aspects.append(
            ChartInfoClass(
                type=self._get_chart_type(dashboard_element),
                description=dashboard_element.description or "",
                title=dashboard_element.title or "",
                lastModified=ChangeAuditStamps(),
                chartUrl=dashboard_element.url(self.source_config.base_url),
                inputs=dashboard_element.get_view_urns(
                    self.source_config.platform_name, self.source_config.env),
            ))

        return MetadataChangeEvent(proposedSnapshot=chart_snapshot)
Ejemplo n.º 9
0
 def get_metadata_change_event(
     self, snap_shot: Union["DatasetSnapshot", "DashboardSnapshot", "ChartSnapshot"]
 ) -> MetadataWorkUnit:
     """Wrap *snap_shot* in an MCE, report the resulting workunit, and return it."""
     work_unit = MetadataWorkUnit(
         id=snap_shot.urn,
         mce=MetadataChangeEvent(proposedSnapshot=snap_shot),
     )
     self.report.report_workunit(work_unit)
     return work_unit
Ejemplo n.º 10
0
 def build_wu(
     self, dataset_snapshot: DatasetSnapshot, dataset_name: str
 ) -> Generator[ApiWorkUnit, None, None]:
     """Yield a single reported ApiWorkUnit wrapping *dataset_snapshot*."""
     workunit = ApiWorkUnit(
         id=dataset_name,
         mce=MetadataChangeEvent(proposedSnapshot=dataset_snapshot),
     )
     self.report.report_workunit(workunit)
     yield workunit
Ejemplo n.º 11
0
    def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
        """Walk space -> report -> query -> chart via the Mode API and yield
        a workunit per chart.

        Yields:
            MetadataWorkUnit: one reported workunit per discovered chart.
        """
        # Space/collection -> report -> query -> Chart
        for space_token, space_name in self.space_tokens.items():
            reports = self._get_reports(space_token)
            for report in reports:
                report_token = report.get("token", "")
                queries = self._get_queries(report_token)
                for query in queries:
                    charts = self._get_charts(report_token,
                                              query.get("token", ""))
                    # build charts
                    for chart in charts:
                        # Fall back to an empty dict so charts carrying neither
                        # "view" nor "view_vegas" don't raise AttributeError on
                        # the .get() calls below.
                        view = chart.get("view") or chart.get("view_vegas") or {}
                        chart_name = view.get("title") or view.get(
                            "chartTitle") or ""
                        path = (f"/mode/{self.config.workspace}/{space_name}"
                                f"/{report.get('name')}/{query.get('name')}/"
                                f"{chart_name}")
                        chart_snapshot = self.construct_chart_from_api_data(
                            chart, query, path)
                        mce = MetadataChangeEvent(
                            proposedSnapshot=chart_snapshot)
                        wu = MetadataWorkUnit(id=chart_snapshot.urn, mce=mce)
                        self.report.report_workunit(wu)

                        yield wu
Ejemplo n.º 12
0
    def build_corp_user_mce(
            self, dn: str, attrs: dict,
            manager_ldap: Optional[str]) -> Optional[MetadataChangeEvent]:
        """
        Create the MetadataChangeEvent via DN and attributes.
        """
        def _first(name: str) -> Optional[str]:
            # LDAP attribute values arrive as lists of bytes; take the first.
            if name in attrs:
                return attrs[name][0].decode()
            return None

        ldap_user = guess_person_ldap(attrs)
        full_name = attrs["cn"][0].decode()

        email = _first("mail")
        if email is None:
            email = ldap_user
        display_name = _first("displayName")
        if display_name is None:
            display_name = full_name
        manager_urn = f"urn:li:corpuser:{manager_ldap}" if manager_ldap else None

        user_info = CorpUserInfoClass(
            active=True,
            email=email,
            fullName=full_name,
            firstName=attrs["givenName"][0].decode(),
            lastName=attrs["sn"][0].decode(),
            departmentName=_first("departmentNumber"),
            displayName=display_name,
            title=_first("title"),
            managerUrn=manager_urn,
        )
        snapshot = CorpUserSnapshotClass(
            urn=f"urn:li:corpuser:{ldap_user}",
            aspects=[user_info],
        )
        return MetadataChangeEvent(proposedSnapshot=snapshot)
Ejemplo n.º 13
0
    def _make_chart_mce(
            self,
            dashboard_element: LookerDashboardElement) -> MetadataChangeEvent:
        """Build a chart MCE (with created/lastModified audit stamps)."""
        actor = self.source_config.actor
        sys_time = get_sys_time()
        chart_urn = f"urn:li:chart:({self.source_config.platform_name},{dashboard_element.get_urn_element_id()})"

        last_modified = ChangeAuditStamps(
            created=AuditStamp(time=sys_time, actor=actor),
            lastModified=AuditStamp(time=sys_time, actor=actor),
        )

        # Normalize missing description/title to empty strings.
        description = dashboard_element.description
        if description is None:
            description = ""
        title = dashboard_element.title
        if title is None:
            title = ""

        chart_info = ChartInfoClass(
            type=self._get_chart_type(dashboard_element),
            description=description,
            title=title,
            lastModified=last_modified,
            chartUrl=dashboard_element.url(self.source_config.base_url),
            inputs=dashboard_element.get_view_urns(
                self.source_config.platform_name),
        )

        chart_snapshot = ChartSnapshot(
            urn=chart_urn,
            aspects=[chart_info],
        )
        return MetadataChangeEvent(proposedSnapshot=chart_snapshot)
Ejemplo n.º 14
0
    def _build_dataset_mce(self,
                           looker_view: LookerView) -> MetadataChangeEvent:
        """
        Creates MetadataChangeEvent for the dataset, creating upstream lineage links
        """
        logger.debug(f"looker_view = {looker_view.view_name}")

        dataset_name = looker_view.view_name
        actor = self.source_config.actor
        sys_time = get_sys_time()

        dataset_snapshot = DatasetSnapshot(
            urn=
            f"urn:li:dataset:(urn:li:dataPlatform:{self.source_config.platform_name},{dataset_name},{self.source_config.env})",
            aspects=[
                Status(removed=False),
                # NOTE(review): helper name carries a typo ("upsteam") — it is
                # defined elsewhere in the project, so the call is kept as-is.
                self._get_upsteam_lineage(looker_view, actor, sys_time),
                self._get_schema(looker_view, actor, sys_time),
            ],
        )

        return MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
Ejemplo n.º 15
0
    def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
        """Page through the dashboard API and yield a workunit per dashboard.

        Fetches pages of PAGE_SIZE until the server-reported total count is
        exhausted.

        Yields:
            MetadataWorkUnit: one reported workunit per dashboard.
        """
        current_dashboard_page = 0
        # we will set total dashboards to the actual number after we get the response
        total_dashboards = PAGE_SIZE

        while current_dashboard_page * PAGE_SIZE <= total_dashboards:
            dashboard_response = self.session.get(
                f"{self.config.connect_uri}/api/v1/dashboard",
                params=
                f"q=(page:{current_dashboard_page},page_size:{PAGE_SIZE})",
            )
            # Parse the response body once (the original parsed it twice).
            payload = dashboard_response.json()
            total_dashboards = payload.get("count") or 0

            current_dashboard_page += 1

            for dashboard_data in payload["result"]:
                dashboard_snapshot = self.construct_dashboard_from_api_data(
                    dashboard_data)
                mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
                wu = MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
                self.report.report_workunit(wu)

                yield wu
Ejemplo n.º 16
0
    def _get_entity_workunit(
        self, feature_view: FeatureView, entity: Entity
    ) -> MetadataWorkUnit:
        """
        Generate an MLPrimaryKey work unit for a Feast entity.
        """
        feature_view_name = f"{self.feature_store.project}.{feature_view.name}"

        key_properties = MLPrimaryKeyPropertiesClass(
            description=entity.description,
            dataType=self._get_field_type(entity.value_type, entity.name),
            sources=self._get_data_sources(feature_view),
        )

        entity_snapshot = MLPrimaryKeySnapshot(
            urn=builder.make_ml_primary_key_urn(feature_view_name, entity.name),
            aspects=[StatusClass(removed=False), key_properties],
        )

        return MetadataWorkUnit(
            id=entity.name,
            mce=MetadataChangeEvent(proposedSnapshot=entity_snapshot),
        )
Ejemplo n.º 17
0
    def _build_dataset_mce(self,
                           looker_view: LookerView) -> MetadataChangeEvent:
        """
        Creates MetadataChangeEvent for the dataset, creating upstream lineage links
        """
        logger.debug(f"looker_view = {looker_view.id}")

        # Collect aspects first; optional ones are appended only when present.
        aspects = [
            BrowsePaths(
                paths=[looker_view.id.get_browse_path(self.source_config)]),
            Status(removed=False),
        ]

        upstream_lineage = self._get_upstream_lineage(looker_view)
        if upstream_lineage is not None:
            aspects.append(upstream_lineage)

        schema_metadata = LookerUtil._get_schema(
            self.source_config.platform_name,
            looker_view.id.view_name,
            looker_view.fields,
            self.reporter,
        )
        if schema_metadata is not None:
            aspects.append(schema_metadata)

        aspects.append(self._get_custom_properties(looker_view))

        dataset_snapshot = DatasetSnapshot(
            urn=looker_view.id.get_urn(self.source_config),
            aspects=aspects,
        )

        return MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
Ejemplo n.º 18
0
    def _make_chart_mce(
        self, dashboard_element: LookerDashboardElement, dashboard: LookerDashboard
    ) -> MetadataChangeEvent:
        """Build a chart MCE for one dashboard element, including ownership."""
        chart_urn = builder.make_chart_urn(
            self.source_config.platform_name, dashboard_element.get_urn_element_id()
        )

        # Record upstream fields sorted and de-duplicated for stable output.
        if dashboard_element.upstream_fields:
            upstream_fields = ",".join(sorted(set(dashboard_element.upstream_fields)))
        else:
            upstream_fields = ""

        chart_info = ChartInfoClass(
            type=self._get_chart_type(dashboard_element),
            description=dashboard_element.description or "",
            title=dashboard_element.title or "",
            lastModified=ChangeAuditStamps(),
            chartUrl=dashboard_element.url(self.source_config.base_url),
            inputs=dashboard_element.get_view_urns(self.source_config),
            customProperties={"upstream_fields": upstream_fields},
        )

        chart_snapshot = ChartSnapshot(
            urn=chart_urn,
            aspects=[chart_info],
        )

        # Ownership is optional and comes from the enclosing dashboard.
        ownership = self.get_ownership(dashboard)
        if ownership is not None:
            chart_snapshot.aspects.append(ownership)

        return MetadataChangeEvent(proposedSnapshot=chart_snapshot)
Ejemplo n.º 19
0
    def loop_tables(
        self,
        inspector: Inspector,
        schema: str,
        sql_config: SQLAlchemyConfig,
    ) -> Iterable[SqlWorkUnit]:
        """Yield a SqlWorkUnit for every allowed table in *schema*.

        Tables rejected by ``sql_config.table_pattern`` are reported as
        dropped instead of emitted.
        """
        for table in inspector.get_table_names(schema):
            # NOTE(review): this reassigns the ``schema`` parameter inside the
            # loop, so later iterations see the standardized value produced by
            # the previous table — confirm that compounding is intended.
            schema, table = self.standardize_schema_table_names(schema=schema,
                                                                entity=table)
            dataset_name = self.get_identifier(schema=schema,
                                               entity=table,
                                               inspector=inspector)
            self.report.report_entity_scanned(dataset_name, ent_type="table")

            if not sql_config.table_pattern.allowed(dataset_name):
                self.report.report_dropped(dataset_name)
                continue

            columns = inspector.get_columns(table, schema)
            # A table with no columns is suspicious but still emitted.
            if len(columns) == 0:
                self.report.report_warning(dataset_name,
                                           "missing column information")

            try:
                # SQLALchemy stubs are incomplete and missing this method.
                # PR: https://github.com/dropbox/sqlalchemy-stubs/pull/223.
                table_info: dict = inspector.get_table_comment(
                    table, schema)  # type: ignore
            except NotImplementedError:
                # Dialect does not support table comments.
                description: Optional[str] = None
                properties: Dict[str, str] = {}
            else:
                description = table_info["text"]

                # The "properties" field is a non-standard addition to SQLAlchemy's interface.
                properties = table_info.get("properties", {})

            # TODO: capture inspector.get_pk_constraint
            # TODO: capture inspector.get_sorted_table_and_fkc_names

            dataset_snapshot = DatasetSnapshot(
                urn=
                f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{dataset_name},{self.config.env})",
                aspects=[],
            )
            # Only attach a properties aspect when there is something to record.
            if description is not None or properties:
                dataset_properties = DatasetPropertiesClass(
                    description=description,
                    customProperties=properties,
                )
                dataset_snapshot.aspects.append(dataset_properties)
            schema_metadata = get_schema_metadata(self.report, dataset_name,
                                                  self.platform, columns)
            dataset_snapshot.aspects.append(schema_metadata)

            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            wu = SqlWorkUnit(id=dataset_name, mce=mce)
            self.report.report_workunit(wu)
            yield wu
Ejemplo n.º 20
0
    def get_workunits(self) -> Iterable[SqlWorkUnit]:
        """Reflect every schema and table via SQLAlchemy and yield workunits.

        Schemas/tables filtered out by the configured patterns are reported
        as dropped rather than emitted.
        """
        sql_config = self.config
        if logger.isEnabledFor(logging.DEBUG):
            # If debug logging is enabled, we also want to echo each SQL query issued.
            # NOTE(review): this mutates the shared config's options dict in place.
            sql_config.options["echo"] = True

        url = sql_config.get_sql_alchemy_url()
        logger.debug(f"sql_alchemy_url={url}")
        engine = create_engine(url, **sql_config.options)
        inspector = reflection.Inspector.from_engine(engine)
        for schema in inspector.get_schema_names():
            if not sql_config.schema_pattern.allowed(schema):
                self.report.report_dropped(schema)
                continue

            for table in inspector.get_table_names(schema):
                # NOTE(review): ``schema`` is reassigned inside the loop, so
                # later iterations see the standardized value — confirm intended.
                schema, table = sql_config.standardize_schema_table_names(schema, table)
                dataset_name = sql_config.get_identifier(schema, table)
                self.report.report_table_scanned(dataset_name)

                if not sql_config.table_pattern.allowed(dataset_name):
                    self.report.report_dropped(dataset_name)
                    continue

                columns = inspector.get_columns(table, schema)
                # Not every dialect implements table comments.
                try:
                    table_info: dict = inspector.get_table_comment(table, schema)
                except NotImplementedError:
                    description: Optional[str] = None
                    properties: Dict[str, str] = {}
                else:
                    description = table_info["text"]

                    # The "properties" field is a non-standard addition to SQLAlchemy's interface.
                    properties = table_info.get("properties", {})

                # TODO: capture inspector.get_pk_constraint
                # TODO: capture inspector.get_sorted_table_and_fkc_names

                dataset_snapshot = DatasetSnapshot(
                    urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{dataset_name},{self.config.env})",
                    aspects=[],
                )
                # Only attach a properties aspect when there is something to record.
                if description is not None or properties:
                    dataset_properties = DatasetPropertiesClass(
                        description=description,
                        customProperties=properties,
                        # uri=dataset_name,
                    )
                    dataset_snapshot.aspects.append(dataset_properties)
                schema_metadata = get_schema_metadata(
                    self.report, dataset_name, self.platform, columns
                )
                dataset_snapshot.aspects.append(schema_metadata)

                mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
                wu = SqlWorkUnit(id=dataset_name, mce=mce)
                self.report.report_workunit(wu)
                yield wu
Ejemplo n.º 21
0
def iterate_mce_file(path: str) -> Iterator[MetadataChangeEvent]:
    """Lazily yield MetadataChangeEvents parsed from a JSON file.

    The file may contain either a single MCE object or a list of them.
    """
    with open(path, "r") as f:
        raw = json.load(f)

    objects = raw if isinstance(raw, list) else [raw]
    for obj in objects:
        mce: MetadataChangeEvent = MetadataChangeEvent.from_obj(obj)
        yield mce
Ejemplo n.º 22
0
def iterate_generic_file(
    path: str,
) -> Iterator[Union[MetadataChangeEvent, UsageAggregationClass]]:
    """Yield parsed objects from *path*, dispatching on their shape.

    An object carrying a "proposedSnapshot" key is deserialized as a
    MetadataChangeEvent; anything else as a UsageAggregationClass bucket.
    """
    for raw in _iterate_file(path):
        if "proposedSnapshot" in raw:
            yield MetadataChangeEvent.from_obj(raw)
        else:
            yield UsageAggregationClass.from_obj(raw)
Ejemplo n.º 23
0
    def _get_feature_workunit(
        self,
        feature_view: Union[FeatureView, OnDemandFeatureView],
        feature: Feature,
    ) -> MetadataWorkUnit:
        """
        Generate an MLFeature work unit for a Feast feature.
        """
        feature_view_name = f"{self.feature_store.project}.{feature_view.name}"

        sources = []
        if isinstance(feature_view, FeatureView):
            sources = self._get_data_sources(feature_view)
        elif isinstance(feature_view, OnDemandFeatureView):
            # Request data sources map directly to dataset URNs.
            if feature_view.input_request_data_sources is not None:
                for request_source in feature_view.input_request_data_sources.values():
                    platform, name = self._get_data_source_details(request_source)
                    sources.append(
                        builder.make_dataset_urn(
                            platform,
                            name,
                            self.source_config.environment,
                        )
                    )

            # Upstream feature views contribute their own data sources.
            if feature_view.input_feature_view_projections is not None:
                for projection in feature_view.input_feature_view_projections.values():
                    upstream_view = self.feature_store.get_feature_view(
                        projection.name
                    )
                    sources.extend(self._get_data_sources(upstream_view))

        feature_snapshot = MLFeatureSnapshot(
            urn=builder.make_ml_feature_urn(feature_view_name, feature.name),
            aspects=[StatusClass(removed=False)],
        )
        feature_snapshot.aspects.append(
            MLFeaturePropertiesClass(
                description=feature.labels.get("description"),
                dataType=self._get_field_type(feature.dtype, feature.name),
                sources=sources,
            )
        )

        return MetadataWorkUnit(
            id=feature.name,
            mce=MetadataChangeEvent(proposedSnapshot=feature_snapshot),
        )
Ejemplo n.º 24
0
    def get_entity_wu(self, ingest_table, ingest_entity):
        """
        Generate an MLPrimaryKey workunit for a Feast entity.

        Parameters
        ----------
            ingest_table:
                ingested Feast table
            ingest_entity:
                ingested Feast entity
        """

        # Each configured source kind (batch, stream) becomes an upstream
        # dataset URN, in that order.
        entity_sources = []
        for kind in ("batch", "stream"):
            if ingest_entity[f"{kind}_source"] is not None:
                entity_sources.append(
                    builder.make_dataset_urn(
                        ingest_entity[f"{kind}_source_platform"],
                        ingest_entity[f"{kind}_source_name"],
                        self.config.env,
                    )
                )

        # Snapshot for the entity, carrying its description, type and sources.
        entity_snapshot = MLPrimaryKeySnapshot(
            urn=builder.make_ml_primary_key_urn(
                ingest_table["name"], ingest_entity["name"]
            ),
            aspects=[
                MLPrimaryKeyPropertiesClass(
                    description=ingest_entity["description"],
                    dataType=self.get_field_type(
                        ingest_entity["type"], ingest_entity["name"]
                    ),
                    sources=entity_sources,
                )
            ],
        )

        # Wrap the snapshot in an MCE and emit it as a workunit.
        return MetadataWorkUnit(
            id=ingest_entity["name"],
            mce=MetadataChangeEvent(proposedSnapshot=entity_snapshot),
        )
Ejemplo n.º 25
0
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        """Read MCEs from the configured JSON file and emit one workunit each.

        The file may hold a single MCE object or a list of them.
        """
        with open(self.config.filename, 'r') as f:
            raw = json.load(f)

        entries = raw if isinstance(raw, list) else [raw]
        for index, entry in enumerate(entries):
            workunit = MetadataWorkUnit(
                f"file://{self.config.filename}:{index}",
                MetadataChangeEvent.from_obj(entry),
            )
            self.report.report_workunit(workunit)
            yield workunit
Ejemplo n.º 26
0
 def _get_tag_mce_for_urn(tag_urn: str) -> MetadataChangeEvent:
     """Build an MCE for a predefined Looker tag, owned by the datahub user."""
     assert tag_urn in LookerUtil.tag_definitions
     datahub_owner = OwnerClass(
         owner="urn:li:corpuser:datahub",
         type=OwnershipTypeClass.DATAOWNER,
     )
     snapshot = TagSnapshotClass(
         urn=tag_urn,
         aspects=[
             OwnershipClass(owners=[datahub_owner]),
             LookerUtil.tag_definitions[tag_urn],
         ],
     )
     return MetadataChangeEvent(proposedSnapshot=snapshot)
Ejemplo n.º 27
0
    def loop_views(
        self,
        inspector: Any,
        schema: str,
        sql_config: SQLAlchemyConfig,
    ) -> Iterable[SqlWorkUnit]:
        """Emit one SqlWorkUnit per view in *schema* that passes the view pattern.

        For each view we capture its comment/properties (when the dialect
        supports them), its columns as schema metadata, and the view
        definition text stored in custom properties.
        """
        for view in inspector.get_view_names(schema):
            schema, view = sql_config.standardize_schema_table_names(
                schema, view)
            dataset_name = sql_config.get_identifier(schema, view)
            self.report.report_entity_scanned(dataset_name, ent_type="view")

            if not sql_config.view_pattern.allowed(dataset_name):
                self.report.report_dropped(dataset_name)
                continue

            columns = inspector.get_columns(view, schema)
            try:
                view_info: dict = inspector.get_table_comment(view, schema)
            except NotImplementedError:
                # Not all SQLAlchemy dialects implement comment reflection.
                description: Optional[str] = None
                properties: Dict[str, str] = {}
            else:
                description = view_info["text"]

                # The "properties" field is a non-standard addition to SQLAlchemy's interface.
                properties = view_info.get("properties", {})

            # BUG FIX: pass the schema to get_view_definition — omitting it
            # resolves the view against the default schema, which is wrong
            # (or fails) for views in any other schema. Some dialects also
            # don't implement view-definition reflection at all.
            try:
                view_definition = inspector.get_view_definition(view, schema)
            except NotImplementedError:
                view_definition = ""
            if view_definition is None:
                view_definition = ""
            properties["view_definition"] = view_definition
            properties["is_view"] = "True"

            dataset_snapshot = DatasetSnapshot(
                urn=
                f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{dataset_name},{self.config.env})",
                aspects=[],
            )
            if description is not None or properties:
                dataset_properties = DatasetPropertiesClass(
                    description=description,
                    customProperties=properties,
                    # uri=dataset_name,
                )
                dataset_snapshot.aspects.append(dataset_properties)
            schema_metadata = get_schema_metadata(self.report, dataset_name,
                                                  self.platform, columns)
            dataset_snapshot.aspects.append(schema_metadata)

            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            wu = SqlWorkUnit(id=dataset_name, mce=mce)
            self.report.report_workunit(wu)
            yield wu
Ejemplo n.º 28
0
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        """Emit one workunit per MongoDB collection that passes the filters.

        Databases on the deny list or failing the database pattern are
        skipped; collections failing the collection pattern are dropped.
        """
        env = "PROD"
        platform = "mongodb"

        for database_name in self.mongo_client.list_database_names():
            if database_name in DENY_DATABASE_LIST:
                continue
            if not self.config.database_pattern.allowed(database_name):
                self.report.report_dropped(database_name)
                continue

            database = self.mongo_client[database_name]
            for collection_name in database.list_collection_names():
                dataset_name = f"{database_name}.{collection_name}"
                if not self.config.collection_pattern.allowed(dataset_name):
                    self.report.report_dropped(dataset_name)
                    continue

                dataset_snapshot = DatasetSnapshot()
                dataset_snapshot.urn = f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{env})"
                dataset_snapshot.aspects.append(
                    DatasetPropertiesClass(tags=[], customProperties={})
                )

                # TODO: Guess the schema via sampling
                # State of the art seems to be https://github.com/variety/variety.

                # TODO: use list_indexes() or index_information() to get index information
                # See https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes.

                wu = MetadataWorkUnit(
                    id=dataset_name,
                    mce=MetadataChangeEvent(proposedSnapshot=dataset_snapshot),
                )
                self.report.report_workunit(wu)
                yield wu
Ejemplo n.º 29
0
    def get_feature_wu(self, ingest_table, ingest_feature):
        """
        Generate an MLFeature workunit for a Feast feature.

        Parameters
        ----------
            ingest_table:
                ingested Feast table
            ingest_feature:
                ingested Feast feature
        """

        # Each configured source kind (batch, stream) becomes an upstream
        # dataset URN, in that order.
        feature_sources = []
        for kind in ("batch", "stream"):
            if ingest_feature[f"{kind}_source"] is not None:
                feature_sources.append(
                    builder.make_dataset_urn(
                        ingest_feature[f"{kind}_source_platform"],
                        ingest_feature[f"{kind}_source_name"],
                        self.config.env,
                    )
                )

        # Snapshot for the feature, carrying its type and sources.
        feature_snapshot = MLFeatureSnapshot(
            urn=builder.make_ml_feature_urn(
                ingest_table["name"], ingest_feature["name"]
            ),
            aspects=[
                MLFeaturePropertiesClass(
                    dataType=self.get_field_type(
                        ingest_feature["type"], ingest_feature["name"]
                    ),
                    sources=feature_sources,
                )
            ],
        )

        # Wrap the snapshot in an MCE and emit it as a workunit.
        return MetadataWorkUnit(
            id=ingest_feature["name"],
            mce=MetadataChangeEvent(proposedSnapshot=feature_snapshot),
        )
Ejemplo n.º 30
0
    def get_feature_group_wu(
        self, feature_group_details: Dict[str, Any]
    ) -> MetadataWorkUnit:
        """
        Generate an MLFeatureTable workunit for a SageMaker feature group.

        Parameters
        ----------
            feature_group_details:
                ingested SageMaker feature group from get_feature_group_details()
        """

        name = feature_group_details["FeatureGroupName"]
        record_id = feature_group_details["RecordIdentifierFeatureName"]

        # Every defined feature except the record identifier is a plain
        # (non-primary-key) ML feature.
        non_key_features = [
            builder.make_ml_feature_urn(name, feature["FeatureName"])
            for feature in feature_group_details["FeatureDefinitions"]
            if feature["FeatureName"] != record_id
        ]

        table_properties = MLFeatureTablePropertiesClass(
            description=feature_group_details.get("Description"),
            mlFeatures=non_key_features,
            mlPrimaryKeys=[builder.make_ml_primary_key_urn(name, record_id)],
            # additional metadata
            customProperties={
                "arn": feature_group_details["FeatureGroupArn"],
                "creation_time": str(feature_group_details["CreationTime"]),
                "status": feature_group_details["FeatureGroupStatus"],
            },
        )

        feature_group_snapshot = MLFeatureTableSnapshot(
            urn=builder.make_ml_feature_table_urn("sagemaker", name),
            aspects=[
                BrowsePathsClass(paths=[f"sagemaker/{name}"]),
                table_properties,
            ],
        )

        # Wrap the snapshot in an MCE and emit it as a workunit.
        return MetadataWorkUnit(
            id=name,
            mce=MetadataChangeEvent(proposedSnapshot=feature_group_snapshot),
        )