Example #1
    def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
        current_dashboard_page = 0
        # we will set total dashboards to the actual number after we get the response
        total_dashboards = PAGE_SIZE

        while current_dashboard_page * PAGE_SIZE <= total_dashboards:
            dashboard_response = self.session.get(
                f"{self.config.connect_uri}/api/v1/dashboard",
                params=f"q=(page:{current_dashboard_page},page_size:{PAGE_SIZE})",
            )
            payload = dashboard_response.json()
            total_dashboards = payload.get("count") or 0

            current_dashboard_page += 1

            for dashboard_data in payload["result"]:
                dashboard_snapshot = self.construct_dashboard_from_api_data(
                    dashboard_data)
                mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
                wu = MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
                self.report.report_workunit(wu)

                yield wu
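
Example #1 (and Example #25 below) drives pagination off the count field returned by the API: the loop starts from a provisional total and replaces it with the real count after the first response. A minimal standalone sketch of that pattern, with a placeholder base URL and page size rather than any real endpoint:

import requests

PAGE_SIZE = 25  # placeholder page size


def iter_results(base_url: str, session: requests.Session):
    page = 0
    total = PAGE_SIZE  # provisional; replaced by the real count after the first response
    while page * PAGE_SIZE <= total:
        resp = session.get(
            f"{base_url}/api/v1/dashboard",
            params=f"q=(page:{page},page_size:{PAGE_SIZE})",
        )
        payload = resp.json()
        total = payload.get("count") or 0
        page += 1
        yield from payload.get("result", [])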
Example #2
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        client = self._get_looker_client()
        dashboard_ids = [
            dashboard_base.id
            for dashboard_base in client.all_dashboards(fields="id")
            if dashboard_base.id is not None
        ]

        for dashboard_id in dashboard_ids:
            self.reporter.report_dashboards_scanned()
            if not self.source_config.dashboard_pattern.allowed(dashboard_id):
                self.reporter.report_dashboards_dropped(dashboard_id)
                continue
            try:
                fields = [
                    "id", "title", "dashboard_elements", "dashboard_filters"
                ]
                dashboard_object = client.dashboard(dashboard_id=dashboard_id,
                                                    fields=",".join(fields))
            except SDKError:
                # A looker dashboard could be deleted in between the list and the get
                logger.warning(
                    f"Error occuried while loading dashboard {dashboard_id}. Skipping."
                )
                continue

            looker_dashboard = self._get_looker_dashboard(dashboard_object)
            mces = self._make_dashboard_and_chart_mces(looker_dashboard)
            for mce in mces:
                workunit = MetadataWorkUnit(
                    id=f"looker-{mce.proposedSnapshot.urn}", mce=mce)
                self.reporter.report_workunit(workunit)
                yield workunit
Example #3
    def construct_flow_workunit(
        self, connector: ConnectorManifest
    ) -> Iterable[MetadataWorkUnit]:
        connector_name = connector.name
        connector_type = connector.type
        connector_class = connector.config.get("connector.class")
        # connector_url = connector.url  # NOTE: this will expose connector credential when used
        flow_urn = builder.make_data_flow_urn(
            "kafka-connect", connector_name, self.config.env
        )
        flow_property_bag: Optional[Dict[str, str]] = None
        mce = models.MetadataChangeEventClass(
            proposedSnapshot=models.DataFlowSnapshotClass(
                urn=flow_urn,
                aspects=[
                    models.DataFlowInfoClass(
                        name=connector_name,
                        description=f"{connector_type.capitalize()} connector using `{connector_class}` plugin.",
                        customProperties=flow_property_bag,
                        # externalUrl=connector_url, # NOTE: this will expose connector credential when used
                    ),
                    # ownership,
                    # tags,
                ],
            )
        )

        for c in [connector_name]:
            wu = MetadataWorkUnit(id=c, mce=mce)
            self.report.report_workunit(wu)
            yield wu
Example #4
    def get_datajob_wu(self, node: Dict[str, Any],
                       job: Dict[str, Any]) -> MetadataWorkUnit:
        """
        Generate a DataJob workunit for a component (node) in a Glue job.

        Parameters
        ----------
            node:
                Node from process_dataflow_graph()
            job:
                Job object from get_all_jobs()
        """
        mce = MetadataChangeEventClass(proposedSnapshot=DataJobSnapshotClass(
            urn=node["urn"],
            aspects=[
                DataJobInfoClass(
                    name=f"{job['Name']}:{node['NodeType']}-{node['Id']}",
                    type="GLUE",
                    customProperties={
                        **{x["Name"]: x["Value"]
                           for x in node["Args"]},
                        "transformType": node["NodeType"],
                        "nodeId": node["Id"],
                    },
                ),
                DataJobInputOutputClass(
                    inputDatasets=node["inputDatasets"],
                    outputDatasets=node["outputDatasets"],
                    inputDatajobs=node["inputDatajobs"],
                ),
            ],
        ))

        return MetadataWorkUnit(id=f'{job["Name"]}-{node["Id"]}', mce=mce)
Example #5
    def get_dataflow_wu(self, flow_urn: str,
                        job: Dict[str, Any]) -> MetadataWorkUnit:
        """
        Generate a DataFlow workunit for a Glue job.

        Parameters
        ----------
            flow_urn:
                URN for the flow
            job:
                Job object from get_all_jobs()
        """
        mce = MetadataChangeEventClass(proposedSnapshot=DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                DataFlowInfoClass(
                    name=job["Name"],
                    description=job["Description"],
                    # specify a few Glue-specific properties
                    customProperties={
                        "role": job["Role"],
                        "created": str(job["CreatedOn"]),
                        "modified": str(job["LastModifiedOn"]),
                        "command": job["Command"]["ScriptLocation"],
                    },
                ),
            ],
        ))

        return MetadataWorkUnit(id=job["Name"], mce=mce)
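Examples #4 and #5 expect the caller to supply the flow URN and a Glue job dict. A hedged usage sketch of wiring those together with DataHub's URN builder, the same helper Example #28 below uses (the job fields here are placeholder values, and `source` stands in for a Glue source instance):

import datahub.emitter.mce_builder as mce_builder

job = {
    "Name": "nightly-etl",
    "Description": "example job",
    "Role": "arn:aws:iam::123456789012:role/glue",
    "CreatedOn": "2021-01-01",
    "LastModifiedOn": "2021-01-02",
    "Command": {"ScriptLocation": "s3://bucket/script.py"},
}

flow_urn = mce_builder.make_data_flow_urn("glue", job["Name"], "PROD")
# flow_wu = source.get_dataflow_wu(flow_urn, job)
# job_wu = source.get_datajob_wu(node, job)  # node comes from process_dataflow_graph()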
Example #6
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        env: str = "PROD"
        platform = self.platform
        nodes = loadManifestAndCatalog(
            self.config.manifest_path, self.config.catalog_path, platform, env
        )

        for node in nodes:
            mce = MetadataChangeEvent()

            dataset_snapshot = DatasetSnapshot()
            dataset_snapshot.urn = node.datahub_urn
            custom_properties = get_custom_properties(node)

            dbt_properties = DatasetPropertiesClass()
            dbt_properties.description = node.dbt_name
            dbt_properties.customProperties = custom_properties

            dataset_snapshot.aspects.append(dbt_properties)

            upstreams = get_upstream_lineage(node.upstream_urns)
            if upstreams is not None:
                dataset_snapshot.aspects.append(upstreams)

            schema_metadata = get_schema_metadata(self.report, node, platform)
            dataset_snapshot.aspects.append(schema_metadata)

            mce.proposedSnapshot = dataset_snapshot
            wu = MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
            self.report.report_workunit(wu)

            yield wu
Example #7
    def handle_user(self, dn: str,
                    attrs: Dict[str, Any]) -> Iterable[MetadataWorkUnit]:
        """
        Handle a DN and attributes by adding manager info and constructing a
        work unit based on the information.
        """
        manager_ldap = None
        if "manager" in attrs:
            try:
                m_cn = attrs["manager"][0].decode()
                manager_msgid = self.ldap_client.search_ext(
                    m_cn,
                    ldap.SCOPE_BASE,
                    self.config.filter,
                    serverctrls=[self.lc],
                )
                _m_dn, m_attrs = self.ldap_client.result3(manager_msgid)[1][0]
                manager_ldap = guess_person_ldap(m_attrs)
            except ldap.LDAPError as e:
                self.report.report_warning(
                    dn, "manager LDAP search failed: {}".format(e))

        mce = self.build_corp_user_mce(dn, attrs, manager_ldap)
        if mce:
            wu = MetadataWorkUnit(dn, mce)
            self.report.report_workunit(wu)
            yield wu
        else:
            self.report.report_dropped(dn)
Example #8
    def handle_user(self, dn, attrs) -> Iterable[MetadataWorkUnit]:
        """
        Handle a DN and attributes by adding manager info and constructing a
        work unit based on the information.
        """
        manager_ldap = None
        if "manager" in attrs:
            try:
                m_cn = attrs["manager"][0].split(b",")[0]
                manager_msgid = self.ldap_client.search_ext(
                    self.config.base_dn,
                    ldap.SCOPE_SUBTREE,
                    f"({m_cn.decode()})",
                    serverctrls=[self.lc],
                )
                m_dn, m_attrs = self.ldap_client.result3(manager_msgid)[1][0]
                manager_ldap = guess_person_ldap(m_dn, m_attrs)
            except ldap.LDAPError as e:
                self.report.report_warning(
                    dn, "manager LDAP search failed: {}".format(e)
                )

        mce = self.build_corp_user_mce(dn, attrs, manager_ldap)
        if mce:
            wu = MetadataWorkUnit(dn, mce)
            self.report.report_workunit(wu)
            yield wu
        yield from []
Example #9
 def get_workunits(self) -> Iterable[MetadataWorkUnit]:
     for i, mce in enumerate(iterate_mce_file(self.config.filename)):
         if not mce.validate():
             raise ValueError(
                 f"failed to parse into valid MCE: {mce} (index {i})")
         wu = MetadataWorkUnit(f"file://{self.config.filename}:{i}", mce)
         self.report.report_workunit(wu)
         yield wu
Example #10
    def handle_group(self, dn: str,
                     attrs: Dict[str, Any]) -> Iterable[MetadataWorkUnit]:
        """Creates a workunit for LDAP groups."""

        mce = self.build_corp_group_mce(attrs)
        if mce:
            wu = MetadataWorkUnit(dn, mce)
            self.report.report_workunit(wu)
            yield wu
        yield from []
Example #11
def create_metadata_work_unit(timestamp):
    dataset_snapshot = DatasetSnapshot(
        urn="urn:li:dataset:(urn:li:dataPlatform:glue,datalake_grilled.Barbeque,PROD)",
        aspects=[],
    )

    dataset_snapshot.aspects.append(Status(removed=False))

    dataset_snapshot.aspects.append(
        OwnershipClass(
            owners=[
                OwnerClass(
                    owner="urn:li:corpuser:Susan", type=OwnershipTypeClass.DATAOWNER
                )
            ],
            lastModified=AuditStampClass(
                time=timestamp, actor="urn:li:corpuser:datahub"
            ),
        )
    )

    dataset_snapshot.aspects.append(
        DatasetPropertiesClass(
            description="Grilled Food",
            customProperties={},
            uri=None,
            tags=[],
        )
    )

    fields = [
        SchemaField(
            fieldPath="Size",
            nativeDataType="int",
            type=SchemaFieldDataType(type=NumberTypeClass()),
            description="Maximum attendees permitted",
            nullable=True,
            recursive=False,
        )
    ]

    schema_metadata = SchemaMetadata(
        schemaName="datalake_grilled.Barbeque",
        version=0,
        fields=fields,
        platform="urn:li:dataPlatform:glue",
        created=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        lastModified=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
    )
    dataset_snapshot.aspects.append(schema_metadata)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return MetadataWorkUnit(id="glue-datalake_grilled.Barbeque", mce=mce)
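The work unit returned above can be serialized for the file-based sources shown in Examples #13 and #18: the generated MCE classes expose to_obj(), the counterpart of the from_obj() those examples call. A small, hedged usage sketch (it assumes create_metadata_work_unit() is importable and that the work unit keeps its mce attribute, as in the constructor calls throughout this page):

import json
import time

wu = create_metadata_work_unit(timestamp=int(time.time() * 1000))

# Write the MCE as JSON so a file source (Examples #13/#18) could read it back.
with open("glue_mces.json", "w") as f:
    json.dump([wu.mce.to_obj()], f, indent=2)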
Example #12
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        def get_all_tables() -> List[dict]:
            def get_tables_from_database(database_name: str,
                                         tables: List) -> List[dict]:
                kwargs = {"DatabaseName": database_name}
                while True:
                    data = self.glue_client.get_tables(**kwargs)
                    tables += data["TableList"]
                    if "NextToken" in data:
                        kwargs["NextToken"] = data["NextToken"]
                    else:
                        break
                return tables

            def get_tables_from_all_databases() -> List[dict]:
                tables = []
                kwargs: Dict = {}
                while True:
                    data = self.glue_client.search_tables(**kwargs)
                    tables += data["TableList"]
                    if "NextToken" in data:
                        kwargs["NextToken"] = data["NextToken"]
                    else:
                        break
                return tables

            if self.source_config.database_pattern.is_fully_specified_allow_list():
                all_tables: List[dict] = []
                database_names = self.source_config.database_pattern.get_allowed_list()
                for database in database_names:
                    # get_tables_from_database() extends and returns the same list,
                    # so reassign instead of += to avoid appending the tables twice
                    all_tables = get_tables_from_database(database, all_tables)
            else:
                all_tables = get_tables_from_all_databases()
            return all_tables

        tables = get_all_tables()

        for table in tables:
            database_name = table["DatabaseName"]
            table_name = table["Name"]
            full_table_name = f"{database_name}.{table_name}"
            self.report.report_table_scanned()
            if not self.source_config.database_pattern.allowed(
                    database_name
            ) or not self.source_config.table_pattern.allowed(full_table_name):
                self.report.report_table_dropped(full_table_name)
                continue

            mce = self._extract_record(table, full_table_name)
            workunit = MetadataWorkUnit(id=f"glue-{full_table_name}", mce=mce)
            self.report.report_workunit(workunit)
            yield workunit
Example #13
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        with open(self.config.filename, 'r') as f:
            mce_obj_list = json.load(f)
        if not isinstance(mce_obj_list, list):
            mce_obj_list = [mce_obj_list]

        for i, obj in enumerate(mce_obj_list):
            mce: MetadataChangeEvent = MetadataChangeEvent.from_obj(obj)
            wu = MetadataWorkUnit(f"file://{self.config.filename}:{i}", mce)
            self.report.report_workunit(wu)
            yield wu
Example #14
    def get_entity_wu(self, ingest_table, ingest_entity):
        """
        Generate an MLPrimaryKey workunit for a Feast entity.

        Parameters
        ----------
            ingest_table:
                ingested Feast table
            ingest_entity:
                ingested Feast entity
        """

        # create snapshot instance for the entity
        entity_snapshot = MLPrimaryKeySnapshot(
            urn=builder.make_ml_primary_key_urn(
                ingest_table["name"], ingest_entity["name"]
            ),
            aspects=[],
        )

        entity_sources = []

        if ingest_entity["batch_source"] is not None:
            entity_sources.append(
                builder.make_dataset_urn(
                    ingest_entity["batch_source_platform"],
                    ingest_entity["batch_source_name"],
                    self.config.env,
                )
            )

        if ingest_entity["stream_source"] is not None:
            entity_sources.append(
                builder.make_dataset_urn(
                    ingest_entity["stream_source_platform"],
                    ingest_entity["stream_source_name"],
                    self.config.env,
                )
            )

        # append entity name and type
        entity_snapshot.aspects.append(
            MLPrimaryKeyPropertiesClass(
                description=ingest_entity["description"],
                dataType=self.get_field_type(
                    ingest_entity["type"], ingest_entity["name"]
                ),
                sources=entity_sources,
            )
        )

        # make the MCE and workunit
        mce = MetadataChangeEvent(proposedSnapshot=entity_snapshot)
        return MetadataWorkUnit(id=ingest_entity["name"], mce=mce)
Example #15
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        viewfile_loader = LookerViewFileLoader(self.source_config.base_folder)

        model_files = sorted(
            f
            for f in glob.glob(
                f"{self.source_config.base_folder}/**/*.model.lkml", recursive=True
            )
        )
        for file_path in model_files:
            model_name = Path(file_path).stem
            self.reporter.report_models_scanned()
            if not self.source_config.model_pattern.allowed(model_name):
                self.reporter.report_models_dropped(model_name)
                continue
            try:
                model = self._load_model(file_path)
            except Exception:
                self.reporter.report_warning(
                    "LookML", f"unable to parse Looker model: {file_path}"
                )
                continue

            for include in model.resolved_includes:
                is_view_seen = viewfile_loader.is_view_seen(include)
                if is_view_seen:
                    continue
                looker_viewfile = viewfile_loader.load_viewfile(
                    include, model.connection
                )
                if looker_viewfile is not None:
                    for raw_view in looker_viewfile.views:
                        maybe_looker_view = LookerView.from_looker_dict(
                            raw_view,
                            model.connection,
                            looker_viewfile,
                            viewfile_loader,
                            self.source_config.parse_table_names_from_sql,
                        )
                        if maybe_looker_view:
                            self.reporter.report_views_scanned()
                            if self.source_config.view_pattern.allowed(
                                maybe_looker_view.view_name
                            ):
                                mce = self._build_dataset_mce(maybe_looker_view)
                                workunit = MetadataWorkUnit(
                                    id=f"lookml-{maybe_looker_view.view_name}", mce=mce
                                )
                                self.reporter.report_workunit(workunit)
                                yield workunit
                            else:
                                self.reporter.report_views_dropped(
                                    maybe_looker_view.view_name
                                )
Example #16
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        topics = self.consumer.list_topics().topics
        for t in topics:
            self.report.report_topic_scanned(t)

            if self.source_config.topic_patterns.allowed(t):
                mce = self._extract_record(t)
                wu = MetadataWorkUnit(id=f"kafka-{t}", mce=mce)
                self.report.report_workunit(wu)
                yield wu
            else:
                self.report.report_dropped(t)
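
Most of these sources gate names through a configured allow/deny pattern before emitting a work unit (topic_patterns here; dashboard_pattern, database_pattern and table_pattern elsewhere). A toy, regex-based stand-in for that check, not the actual datahub pattern class:

import re
from dataclasses import dataclass, field
from typing import List


@dataclass
class SimpleAllowDenyPattern:
    allow: List[str] = field(default_factory=lambda: [".*"])
    deny: List[str] = field(default_factory=list)

    def allowed(self, name: str) -> bool:
        # a name must miss every deny regex and match at least one allow regex
        if any(re.match(p, name) for p in self.deny):
            return False
        return any(re.match(p, name) for p in self.allow)


pattern = SimpleAllowDenyPattern(deny=[r"^_"])
assert pattern.allowed("orders") and not pattern.allowed("_schemas")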
Example #17
    def get_feature_wu(self, ingest_table, ingest_feature):
        """
        Generate an MLFeature workunit for a Feast feature.

        Parameters
        ----------
            ingest_table:
                ingested Feast table
            ingest_feature:
                ingested Feast feature
        """

        # create snapshot instance for the feature
        feature_snapshot = MLFeatureSnapshot(
            urn=builder.make_ml_feature_urn(
                ingest_table["name"], ingest_feature["name"]
            ),
            aspects=[],
        )

        feature_sources = []

        if ingest_feature["batch_source"] is not None:
            feature_sources.append(
                builder.make_dataset_urn(
                    ingest_feature["batch_source_platform"],
                    ingest_feature["batch_source_name"],
                    self.config.env,
                )
            )

        if ingest_feature["stream_source"] is not None:
            feature_sources.append(
                builder.make_dataset_urn(
                    ingest_feature["stream_source_platform"],
                    ingest_feature["stream_source_name"],
                    self.config.env,
                )
            )

        # append feature name and type
        feature_snapshot.aspects.append(
            MLFeaturePropertiesClass(
                dataType=self.get_field_type(
                    ingest_feature["type"], ingest_feature["name"]
                ),
                sources=feature_sources,
            )
        )

        # make the MCE and workunit
        mce = MetadataChangeEvent(proposedSnapshot=feature_snapshot)
        return MetadataWorkUnit(id=ingest_feature["name"], mce=mce)
Example #18
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        with open(self.config.filename, "r") as f:
            mce_obj_list = json.load(f)
        if not isinstance(mce_obj_list, list):
            mce_obj_list = [mce_obj_list]

        for i, obj in enumerate(mce_obj_list):
            mce: MetadataChangeEvent = MetadataChangeEvent.from_obj(obj)
            if not mce.validate():
                raise ValueError(f"failed to parse into valid MCE: {obj}")
            wu = MetadataWorkUnit(f"file://{self.config.filename}:{i}", mce)
            self.report.report_workunit(wu)
            yield wu
Example #19
    def construct_job_workunits(
            self, connector: ConnectorManifest) -> Iterable[MetadataWorkUnit]:
        connector_name = connector.name
        flow_urn = builder.make_data_flow_urn("kafka-connect", connector_name,
                                              self.config.env)

        job_property_bag: Optional[Dict[str, str]] = None

        lineages = connector.lineages
        if lineages:
            for lineage in lineages:
                source_dataset = lineage.source_dataset
                source_platform = lineage.source_platform
                target_dataset = lineage.target_dataset
                target_platform = lineage.target_platform

                job_urn = builder.make_data_job_urn_with_flow(
                    flow_urn, source_dataset)

                inlets = [
                    builder.make_dataset_urn(source_platform, source_dataset)
                ]
                outlets = [
                    builder.make_dataset_urn(target_platform, target_dataset)
                ]

                mce = models.MetadataChangeEventClass(
                    proposedSnapshot=models.DataJobSnapshotClass(
                        urn=job_urn,
                        aspects=[
                            models.DataJobInfoClass(
                                name=f"{connector_name}:{source_dataset}",
                                type="COMMAND",
                                description=None,
                                customProperties=job_property_bag,
                                # externalUrl=job_url,
                            ),
                            models.DataJobInputOutputClass(
                                inputDatasets=inlets or [],
                                outputDatasets=outlets or [],
                            ),
                            # ownership,
                            # tags,
                        ],
                    ))

                wu = MetadataWorkUnit(id=source_dataset, mce=mce)
                self.report.report_workunit(wu)
                yield wu
Example #20
    def get_workunits(
            self) -> Iterable[Union[MetadataWorkUnit, UsageStatsWorkUnit]]:
        for i, obj in enumerate(iterate_generic_file(self.config.filename)):
            if not obj.validate():
                raise ValueError(f"failed to parse: {obj} (index {i})")

            wu: Union[MetadataWorkUnit, UsageStatsWorkUnit]
            if isinstance(obj, UsageAggregationClass):
                wu = UsageStatsWorkUnit(f"file://{self.config.filename}:{i}",
                                        obj)
            else:
                wu = MetadataWorkUnit(f"file://{self.config.filename}:{i}",
                                      obj)
            self.report.report_workunit(wu)
            yield wu
Example #21
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        def get_all_tables() -> List[dict]:
            def get_tables_from_database(database_name: str) -> List[dict]:
                new_tables = []
                paginator = self.glue_client.get_paginator("get_tables")
                for page in paginator.paginate(DatabaseName=database_name):
                    new_tables += page["TableList"]

                return new_tables

            def get_database_names() -> List[str]:
                database_names = []
                paginator = self.glue_client.get_paginator("get_databases")
                for page in paginator.paginate():
                    for db in page["DatabaseList"]:
                        if self.source_config.database_pattern.allowed(db["Name"]):
                            database_names.append(db["Name"])

                return database_names

            if self.source_config.database_pattern.is_fully_specified_allow_list():
                database_names = self.source_config.database_pattern.get_allowed_list()
            else:
                database_names = get_database_names()

            all_tables: List[dict] = []
            for database in database_names:
                all_tables += get_tables_from_database(database)
            return all_tables

        tables = get_all_tables()

        for table in tables:
            database_name = table["DatabaseName"]
            table_name = table["Name"]
            full_table_name = f"{database_name}.{table_name}"
            self.report.report_table_scanned()
            if not self.source_config.database_pattern.allowed(
                database_name
            ) or not self.source_config.table_pattern.allowed(full_table_name):
                self.report.report_table_dropped(full_table_name)
                continue

            mce = self._extract_record(table, full_table_name)
            workunit = MetadataWorkUnit(id=f"glue-{full_table_name}", mce=mce)
            self.report.report_workunit(workunit)
            yield workunit
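
Compared with the hand-rolled NextToken loops in Example #12, this version leans on boto3 paginators, which handle the token bookkeeping internally. A standalone sketch of that approach (the region and database name are placeholders, and AWS credentials are assumed to be configured):

import boto3

glue = boto3.client("glue", region_name="us-east-1")

tables = []
for page in glue.get_paginator("get_tables").paginate(DatabaseName="my_database"):
    tables.extend(page["TableList"])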
Example #22
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        env = "PROD"
        platform = "mongodb"

        database_names: List[str] = self.mongo_client.list_database_names()
        for database_name in database_names:
            if database_name in DENY_DATABASE_LIST:
                continue
            if not self.config.database_pattern.allowed(database_name):
                self.report.report_dropped(database_name)
                continue

            database = self.mongo_client[database_name]
            collection_names: List[str] = database.list_collection_names()
            for collection_name in collection_names:
                dataset_name = f"{database_name}.{collection_name}"
                if not self.config.collection_pattern.allowed(dataset_name):
                    self.report.report_dropped(dataset_name)
                    continue

                dataset_snapshot = DatasetSnapshot(
                    urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{env})",
                    aspects=[],
                )

                dataset_properties = DatasetPropertiesClass(
                    tags=[],
                    customProperties={},
                )
                dataset_snapshot.aspects.append(dataset_properties)

                # TODO: Guess the schema via sampling
                # State of the art seems to be https://github.com/variety/variety.

                # TODO: use list_indexes() or index_information() to get index information
                # See https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes.

                mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
                wu = MetadataWorkUnit(id=dataset_name, mce=mce)
                self.report.report_workunit(wu)
                yield wu
Example #23
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        platform = self.platform
        nodes = loadManifestAndCatalog(
            self.config.manifest_path,
            self.config.catalog_path,
            self.config.sources_path,
            self.config.load_schemas,
            self.config.target_platform,
            self.config.env,
            self.config.node_type_pattern,
        )

        for node in nodes:

            dataset_snapshot = DatasetSnapshot(
                urn=node.datahub_urn,
                aspects=[],
            )
            custom_properties = get_custom_properties(node)

            dbt_properties = DatasetPropertiesClass(
                description=node.dbt_name,
                customProperties=custom_properties,
                tags=[],
            )
            dataset_snapshot.aspects.append(dbt_properties)

            upstreams = get_upstream_lineage(node.upstream_urns)
            if upstreams is not None:
                dataset_snapshot.aspects.append(upstreams)

            if self.config.load_schemas:
                schema_metadata = get_schema_metadata(self.report, node,
                                                      platform)
                dataset_snapshot.aspects.append(schema_metadata)

            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            wu = MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
            self.report.report_workunit(wu)

            yield wu
Example #24
    def construct_lineage_workunits(
        self, connector: ConnectorManifest
    ) -> Iterable[MetadataWorkUnit]:

        lineages = connector.lineages
        if lineages:
            for lineage in lineages:
                source_dataset = lineage.source_dataset
                source_platform = lineage.source_platform
                target_dataset = lineage.target_dataset
                target_platform = lineage.target_platform

                mce = models.MetadataChangeEventClass(
                    proposedSnapshot=models.DatasetSnapshotClass(
                        urn=builder.make_dataset_urn(
                            target_platform, target_dataset, self.config.env
                        ),
                        aspects=[
                            models.UpstreamLineageClass(
                                upstreams=[
                                    models.UpstreamClass(
                                        dataset=builder.make_dataset_urn(
                                            source_platform,
                                            source_dataset,
                                            self.config.env,
                                        ),
                                        type=models.DatasetLineageTypeClass.TRANSFORMED,
                                        auditStamp=models.AuditStampClass(
                                            time=builder.get_sys_time(),
                                            actor="urn:li:corpuser:datahub",
                                        ),
                                    )
                                ]
                            )
                        ],
                    )
                )

                wu = MetadataWorkUnit(id=source_dataset, mce=mce)
                self.report.report_workunit(wu)
                yield wu
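
For the single-hop lineage built by hand above, DataHub's mce_builder also offers a convenience helper; a hedged sketch (the platforms and dataset names are placeholders, and make_lineage_mce's availability depends on the installed datahub version):

import datahub.emitter.mce_builder as builder

upstream = builder.make_dataset_urn("mysql", "db.src_table", "PROD")
downstream = builder.make_dataset_urn("kafka", "topic_sink", "PROD")

# builds the same UpstreamLineage aspect as the hand-rolled MCE above
lineage_mce = builder.make_lineage_mce([upstream], downstream)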
Example #25
    def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
        current_chart_page = 0
        # we will set total charts to the actual number after we get the response
        total_charts = PAGE_SIZE

        while current_chart_page * PAGE_SIZE <= total_charts:
            chart_response = self.session.get(
                f"{self.config.connect_uri}/api/v1/chart",
                params=f"q=(page:{current_chart_page},page_size:{PAGE_SIZE})",
            )
            current_chart_page += 1

            payload = chart_response.json()
            total_charts = payload["count"]
            for chart_data in payload["result"]:
                chart_snapshot = self.construct_chart_from_chart_data(
                    chart_data)

                mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
                wu = MetadataWorkUnit(id=chart_snapshot.urn, mce=mce)
                self.report.report_workunit(wu)

                yield wu
Example #26
    def get_feature_table_wu(self, ingest_table):
        """
        Generate an MLFeatureTable workunit for a Feast feature table.

        Parameters
        ----------
            ingest_table:
                ingested Feast table
        """

        featuretable_snapshot = MLFeatureTableSnapshot(
            urn=builder.make_ml_feature_table_urn("feast", ingest_table["name"]),
            aspects=[],
        )

        featuretable_snapshot.aspects.append(
            MLFeatureTablePropertiesClass(
                mlFeatures=[
                    builder.make_ml_feature_urn(
                        ingest_table["name"],
                        feature["name"],
                    )
                    for feature in ingest_table["features"]
                ],
                # a feature table can have multiple primary keys, which then act as a composite key
                mlPrimaryKeys=[
                    builder.make_ml_primary_key_urn(
                        ingest_table["name"], entity["name"]
                    )
                    for entity in ingest_table["entities"]
                ],
            )
        )

        # make the MCE and workunit
        mce = MetadataChangeEvent(proposedSnapshot=featuretable_snapshot)
        return MetadataWorkUnit(id=ingest_table["name"], mce=mce)
Example #27
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        platform = "mongodb"

        database_names: List[str] = self.mongo_client.list_database_names()

        # traverse databases in sorted order so output is consistent
        for database_name in sorted(database_names):
            if database_name in DENY_DATABASE_LIST:
                continue
            if not self.config.database_pattern.allowed(database_name):
                self.report.report_dropped(database_name)
                continue

            database = self.mongo_client[database_name]
            collection_names: List[str] = database.list_collection_names()

            # traverse collections in sorted order so output is consistent
            for collection_name in sorted(collection_names):
                dataset_name = f"{database_name}.{collection_name}"

                if not self.config.collection_pattern.allowed(dataset_name):
                    self.report.report_dropped(dataset_name)
                    continue

                dataset_snapshot = DatasetSnapshot(
                    urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.config.env})",
                    aspects=[],
                )

                dataset_properties = DatasetPropertiesClass(
                    tags=[],
                    customProperties={},
                )
                dataset_snapshot.aspects.append(dataset_properties)

                if self.config.enableSchemaInference:

                    collection_schema = construct_schema_pymongo(
                        database[collection_name],
                        delimiter=".",
                        sample_size=self.config.schemaSamplingSize,
                    )

                    # initialize the schema for the collection
                    canonical_schema: List[SchemaField] = []

                    # append each schema field (sort so output is consistent)
                    for schema_field in sorted(
                        collection_schema.values(), key=lambda x: x["delimited_name"]
                    ):
                        field = SchemaField(
                            fieldPath=schema_field["delimited_name"],
                            nativeDataType=self.get_pymongo_type_string(
                                schema_field["type"], dataset_name
                            ),
                            type=self.get_field_type(
                                schema_field["type"], dataset_name
                            ),
                            description=None,
                            nullable=schema_field["nullable"],
                            recursive=False,
                        )
                        canonical_schema.append(field)

                    # create schema metadata object for collection
                    actor = "urn:li:corpuser:etl"
                    sys_time = int(time.time() * 1000)
                    schema_metadata = SchemaMetadata(
                        schemaName=collection_name,
                        platform=f"urn:li:dataPlatform:{platform}",
                        version=0,
                        hash="",
                        platformSchema=SchemalessClass(),
                        created=AuditStamp(time=sys_time, actor=actor),
                        lastModified=AuditStamp(time=sys_time, actor=actor),
                        fields=canonical_schema,
                    )

                    dataset_snapshot.aspects.append(schema_metadata)

                # TODO: use list_indexes() or index_information() to get index information
                # See https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes.

                mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
                wu = MetadataWorkUnit(id=dataset_name, mce=mce)
                self.report.report_workunit(wu)
                yield wu
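
Example #27 delegates schema inference to construct_schema_pymongo, which samples documents from the collection. A toy standalone illustration of that sampling idea (not the actual helper): map each top-level field to the set of Python types observed and whether it was missing from any sampled document.

from collections import defaultdict
from typing import Any, Dict, Iterable, Set, Tuple


def infer_schema(docs: Iterable[Dict[str, Any]]) -> Dict[str, Tuple[Set[str], bool]]:
    seen: Dict[str, Set[str]] = defaultdict(set)
    counts: Dict[str, int] = defaultdict(int)
    total = 0
    for doc in docs:
        total += 1
        for key, value in doc.items():
            seen[key].add(type(value).__name__)
            counts[key] += 1
    # "nullable" here means the field was absent from at least one sampled document
    return {key: (types, counts[key] < total) for key, types in seen.items()}


print(infer_schema([{"a": 1, "b": "x"}, {"a": 2.5}]))
# e.g. {'a': ({'int', 'float'}, False), 'b': ({'str'}, True)}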
Example #28
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:

        tables = self.get_all_tables()

        for table in tables:
            database_name = table["DatabaseName"]
            table_name = table["Name"]
            full_table_name = f"{database_name}.{table_name}"
            self.report.report_table_scanned()
            if not self.source_config.database_pattern.allowed(
                    database_name
            ) or not self.source_config.table_pattern.allowed(full_table_name):
                self.report.report_table_dropped(full_table_name)
                continue

            mce = self._extract_record(table, full_table_name)
            workunit = MetadataWorkUnit(id=f"glue-{full_table_name}", mce=mce)
            self.report.report_workunit(workunit)
            yield workunit

        if self.extract_transforms:

            dags: Dict[str, Any] = {}
            # remember which job produced each flow so the second pass below
            # can look it up instead of reusing the loop's last `job` binding
            flow_jobs: Dict[str, Dict[str, Any]] = {}

            for job in self.get_all_jobs():

                flow_urn = mce_builder.make_data_flow_urn(
                    "glue", job["Name"], self.env)

                flow_wu = self.get_dataflow_wu(flow_urn, job)
                self.report.report_workunit(flow_wu)
                yield flow_wu

                dag = self.get_dataflow_graph(job["Command"]["ScriptLocation"])

                dags[flow_urn] = dag
                flow_jobs[flow_urn] = job

            # run a first pass to pick up s3 bucket names and formats
            # in Glue, it's possible for two buckets to have files of different extensions
            # if this happens, we append the extension in the URN so the sources can be distinguished
            # see process_dataflow_node() for details

            s3_formats: typing.DefaultDict[str, Set[Union[
                str, None]]] = defaultdict(lambda: set())

            for dag in dags.values():
                for s3_name, extension in self.get_dataflow_s3_names(dag):
                    s3_formats[s3_name].add(extension)

            # run second pass to generate node workunits
            for flow_urn, dag in dags.items():

                nodes, new_dataset_ids, new_dataset_mces = self.process_dataflow_graph(
                    dag, flow_urn, s3_formats)

                for node in nodes.values():

                    if node["NodeType"] not in ["DataSource", "DataSink"]:
                        job_wu = self.get_datajob_wu(node, flow_jobs[flow_urn])
                        self.report.report_workunit(job_wu)
                        yield job_wu

                for dataset_id, dataset_mce in zip(new_dataset_ids,
                                                   new_dataset_mces):

                    dataset_wu = MetadataWorkUnit(id=dataset_id,
                                                  mce=dataset_mce)
                    self.report.report_workunit(dataset_wu)
                    yield dataset_wu
Example #29
 def __init__(self):
     self.source_report = SourceReport()
     self.work_units: List[MetadataWorkUnit] = [
         MetadataWorkUnit(id="workunit-1", mce=get_initial_mce())
     ]