Esempio n. 1
0
    def construct_lineage_workunits(
            self, connector: ConnectorManifest) -> Iterable[MetadataWorkUnit]:
        """Yield one lineage work unit for every lineage entry of ``connector``.

        For each entry a MetadataChangeEvent is built whose dataset snapshot
        is addressed by the lineage target and whose only aspect is an
        upstream lineage pointing at the lineage source, typed TRANSFORMED.
        Both URNs are built with ``self.config.env``. Each work unit is
        reported via ``self.report.report_workunit`` before it is yielded.
        Nothing is produced when ``connector.lineages`` is empty or None.
        """
        # A falsy ``lineages`` (None / empty) simply results in zero iterations.
        for edge in connector.lineages or ():
            downstream_urn = builder.make_dataset_urn(
                edge.target_platform, edge.target_dataset, self.config.env)
            upstream_urn = builder.make_dataset_urn(
                edge.source_platform,
                edge.source_dataset,
                self.config.env,
            )

            upstream = models.UpstreamClass(
                dataset=upstream_urn,
                type=models.DatasetLineageTypeClass.TRANSFORMED,
            )
            snapshot = models.DatasetSnapshotClass(
                urn=downstream_urn,
                aspects=[models.UpstreamLineageClass(upstreams=[upstream])],
            )
            event = models.MetadataChangeEventClass(proposedSnapshot=snapshot)

            # The work unit is identified by the *source* dataset name.
            workunit = MetadataWorkUnit(id=edge.source_dataset, mce=event)
            self.report.report_workunit(workunit)
            yield workunit
Esempio n. 2
0

@pytest.mark.parametrize(
    "record,path,snapshot",
    [
        (
            # Simple test.
            models.
            MetadataChangeEventClass(proposedSnapshot=models.DatasetSnapshotClass(
                urn=
                "urn:li:dataset:(urn:li:dataPlatform:bigquery,downstream,PROD)",
                aspects=[
                    models.UpstreamLineageClass(upstreams=[
                        models.UpstreamClass(
                            auditStamp=basicAuditStamp,
                            dataset=
                            "urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream1,PROD)",
                            type="TRANSFORMED",
                        ),
                        models.UpstreamClass(
                            auditStamp=basicAuditStamp,
                            dataset=
                            "urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream2,PROD)",
                            type="TRANSFORMED",
                        ),
                    ])
                ],
            ), ),
            "/entities?action=ingest",
            {
                "entity": {
                    "value": {
Esempio n. 3
0
    def get_lineage_metadata_change_event_proposal(
            entities: List[EntityNodeConfig], preserve_upstream: bool
    ) -> Iterable[MetadataChangeProposalWrapper]:
        """
        Builds the upstream-lineage proposals to be emitted to datahub by going through each entity
        and its upstream nodes.

        An entity with no upstream nodes, or whose own type is unsupported, produces no proposal.
        Upstream nodes of an unsupported type are skipped with a warning; the proposal for the
        entity is still yielded with whatever upstreams were collected.

        :param entities: A list of entities we want to build a proposal on
        :param preserve_upstream: This field determines if we want to query the datahub backend to extract
        the existing upstream lineages for each entity and preserve it (they are placed ahead of the
        newly declared upstreams)
        :return: Lazily yields one "upstreamLineage" UPSERT proposal per qualifying entity
        """
        def _get_entity_urn(entity_config: EntityConfig) -> Optional[str]:
            """Helper inner function to extract a given entity_urn
            A return value of None represents an unsupported entity type
            """
            if entity_config.type == "dataset":
                return make_dataset_urn_with_platform_instance(
                    platform=entity_config.platform,
                    name=entity_config.name,
                    env=entity_config.env,
                    platform_instance=entity_config.platform_instance,
                )
            logger.warning(
                f"Entity type: {entity_config.type} is not supported!")
            return None

        for entity_node in entities:
            # Only entities that declare upstream nodes get an MCP.
            if not entity_node.upstream:
                continue

            entity = entity_node.entity
            logger.info(
                f"Upstream detected for {entity}. Extracting urn...")
            entity_urn = _get_entity_urn(entity)
            if not entity_urn:
                logger.warning(
                    f"Entity type: {entity.type} is unsupported. Entity node {entity.name} and its "
                    f"upstream lineages will be skipped")
                continue

            new_upstreams: List[models.UpstreamClass] = []

            # Carry over the lineage that already exists so the UPSERT below
            # does not discard it.
            if preserve_upstream:
                old_upstream_lineage = get_aspects_for_entity(
                    entity_urn=entity_urn,
                    aspects=["upstreamLineage"],
                    typed=True,
                ).get("upstreamLineage")
                if old_upstream_lineage:
                    # `.get("upstreams")` may yield None; extending with None
                    # would raise TypeError, so fall back to an empty list.
                    new_upstreams.extend(
                        old_upstream_lineage.get("upstreams") or [])

            for upstream_entity_node in entity_node.upstream:
                upstream_entity = upstream_entity_node.entity
                upstream_entity_urn = _get_entity_urn(upstream_entity)
                if not upstream_entity_urn:
                    logger.warning(
                        f"Entity type: {upstream_entity.type} is unsupported. Upstream lineage will be skipped "
                        f"for {upstream_entity.name}->{entity.name}")
                    continue
                new_upstreams.append(
                    models.UpstreamClass(
                        dataset=upstream_entity_urn,
                        type=models.DatasetLineageTypeClass.TRANSFORMED,
                        # `auditStamp` comes from the enclosing scope.
                        auditStamp=auditStamp,
                    ))

            yield MetadataChangeProposalWrapper(
                entityType=entity.type,
                changeType=models.ChangeTypeClass.UPSERT,
                entityUrn=entity_urn,
                aspectName="upstreamLineage",
                aspect=models.UpstreamLineageClass(upstreams=new_upstreams),
            )