Example #1
0
def make_lineage_mce(
    upstream_urns: List[str],
    downstream_urn: str,
    actor: str = make_user_urn("datahub"),
    lineage_type: str = DatasetLineageTypeClass.TRANSFORMED,
) -> MetadataChangeEventClass:
    sys_time = get_sys_time()

    mce = MetadataChangeEventClass(
        proposedSnapshot=DatasetSnapshotClass(
            urn=downstream_urn,
            aspects=[
                UpstreamLineageClass(
                    upstreams=[
                        UpstreamClass(
                            auditStamp=AuditStampClass(
                                time=sys_time,
                                actor=actor,
                            ),
                            dataset=upstream_urn,
                            type=lineage_type,
                        )
                        for upstream_urn in upstream_urns
                    ]
                )
            ],
        )
    )
    return mce
Example #2
0
 def get_lineage_if_enabled(
     self, mce: MetadataChangeEventClass
 ) -> Optional[MetadataChangeProposalWrapper]:
     if self.source_config.emit_s3_lineage:
         # extract dataset properties aspect
         dataset_properties: Optional[
             DatasetPropertiesClass] = mce_builder.get_aspect_if_available(
                 mce, DatasetPropertiesClass)
         if dataset_properties and "Location" in dataset_properties.customProperties:
             location = dataset_properties.customProperties["Location"]
             if location.startswith("s3://"):
                 s3_dataset_urn = make_s3_urn(location,
                                              self.source_config.env)
                 if self.source_config.glue_s3_lineage_direction == "upstream":
                     upstream_lineage = UpstreamLineageClass(upstreams=[
                         UpstreamClass(
                             dataset=s3_dataset_urn,
                             type=DatasetLineageTypeClass.COPY,
                         )
                     ])
                     mcp = MetadataChangeProposalWrapper(
                         entityType="dataset",
                         entityUrn=mce.proposedSnapshot.urn,
                         changeType=ChangeTypeClass.UPSERT,
                         aspectName="upstreamLineage",
                         aspect=upstream_lineage,
                     )
                     return mcp
                 else:
                     # Need to mint the s3 dataset with upstream lineage from it to glue
                     upstream_lineage = UpstreamLineageClass(upstreams=[
                         UpstreamClass(
                             dataset=mce.proposedSnapshot.urn,
                             type=DatasetLineageTypeClass.COPY,
                         )
                     ])
                     mcp = MetadataChangeProposalWrapper(
                         entityType="dataset",
                         entityUrn=s3_dataset_urn,
                         changeType=ChangeTypeClass.UPSERT,
                         aspectName="upstreamLineage",
                         aspect=upstream_lineage,
                     )
                     return mcp
     return None
Example #3
0
    def get_lineage_mcp(
            self, dataset_urn: str) -> Optional[MetadataChangeProposalWrapper]:
        if self.lineage_metadata is None:
            logger.debug("No lineage metadata so skipping getting mcp")
            return None
        dataset_key: Optional[DatasetKey] = mce_builder.dataset_urn_to_key(
            dataset_urn)
        if dataset_key is None:
            logger.debug(
                f"No dataset_key for {dataset_urn} so skipping getting mcp")
            return None
        project_id, dataset_name, tablename = dataset_key.name.split(".")
        bq_table = BigQueryTableRef(project_id, dataset_name, tablename)
        if str(bq_table) in self.lineage_metadata:
            upstream_list: List[UpstreamClass] = []
            # Sorting the list of upstream lineage events in order to avoid creating multiple aspects in backend
            # even if the lineage is same but the order is different.
            for upstream_table in sorted(
                    self.get_upstream_tables(str(bq_table), tables_seen=[])):
                upstream_table_class = UpstreamClass(
                    mce_builder.make_dataset_urn_with_platform_instance(
                        self.platform,
                        "{project}.{database}.{table}".format(
                            project=upstream_table.project,
                            database=upstream_table.dataset,
                            table=upstream_table.table,
                        ),
                        self.config.platform_instance,
                        self.config.env,
                    ),
                    DatasetLineageTypeClass.TRANSFORMED,
                )
                if self.config.upstream_lineage_in_report:
                    current_lineage_map: Set = self.report.upstream_lineage.get(
                        str(bq_table), set())
                    current_lineage_map.add(str(upstream_table))
                    self.report.upstream_lineage[str(
                        bq_table)] = current_lineage_map
                upstream_list.append(upstream_table_class)

            if upstream_list:
                upstream_lineage = UpstreamLineageClass(
                    upstreams=upstream_list)
                mcp = MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_urn,
                    aspectName="upstreamLineage",
                    aspect=upstream_lineage,
                )
                return mcp
        return None
Example #4
0
def make_lineage_mce(
    upstream_urns: List[str],
    downstream_urn: str,
    lineage_type: str = DatasetLineageTypeClass.TRANSFORMED,
) -> MetadataChangeEventClass:
    mce = MetadataChangeEventClass(proposedSnapshot=DatasetSnapshotClass(
        urn=downstream_urn,
        aspects=[
            UpstreamLineageClass(upstreams=[
                UpstreamClass(
                    dataset=upstream_urn,
                    type=lineage_type,
                ) for upstream_urn in upstream_urns
            ])
        ],
    ))
    return mce
Example #5
0
def create_lineage_aspect_mce(
        directive: Directive) -> MetadataChangeEventClass:
    return MetadataChangeEventClass(proposedSnapshot=DatasetSnapshotClass(
        urn=dataset_name_to_urn(directive.table),
        aspects=[
            UpstreamLineageClass(upstreams=[
                UpstreamClass(
                    dataset=dataset_name_to_urn(upstream),
                    type=DatasetLineageTypeClass.TRANSFORMED,
                    auditStamp=AuditStampClass(
                        time=int(time.time() * 1000),
                        actor="urn:li:corpuser:datahub",
                    ),
                ) for upstream in directive.depends_on
            ])
        ],
    ))
Example #6
0
    def get_lineage_mcp(
            self, dataset_urn: str) -> Optional[MetadataChangeProposalWrapper]:
        if self.lineage_metadata is None:
            return None
        dataset_key: Optional[DatasetKey] = mce_builder.dataset_urn_to_key(
            dataset_urn)
        if dataset_key is None:
            return None
        project_id, dataset_name, tablename = dataset_key.name.split(".")
        bq_table = BigQueryTableRef(project_id, dataset_name, tablename)
        if str(bq_table) in self.lineage_metadata:
            upstream_list: List[UpstreamClass] = []
            # Sorting the list of upstream lineage events in order to avoid creating multiple aspects in backend
            # even if the lineage is same but the order is different.
            for ref_table in sorted(self.lineage_metadata[str(bq_table)]):
                upstream_table = BigQueryTableRef.from_string_name(ref_table)
                upstream_table_class = UpstreamClass(
                    mce_builder.make_dataset_urn(
                        self.platform,
                        "{project}.{database}.{table}".format(
                            project=upstream_table.project,
                            database=upstream_table.dataset,
                            table=upstream_table.table,
                        ),
                        self.config.env,
                    ),
                    DatasetLineageTypeClass.TRANSFORMED,
                )
                upstream_list.append(upstream_table_class)

            if upstream_list:
                upstream_lineage = UpstreamLineageClass(
                    upstreams=upstream_list)
                mcp = MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_urn,
                    aspectName="upstreamLineage",
                    aspect=upstream_lineage,
                )
                return mcp
        return None
Example #7
0
    def get_lineage_mcp(
            self, dataset_urn: str) -> Optional[MetadataChangeProposalWrapper]:
        dataset_key = mce_builder.dataset_urn_to_key(dataset_urn)
        if dataset_key is None:
            return None

        dataset_params = dataset_key.name.split(".")
        db_name = dataset_params[0]
        schemaname = dataset_params[1]
        tablename = dataset_params[2]
        if db_name in self.catalog_metadata:
            if schemaname in self.catalog_metadata[db_name]:
                external_db_params = self.catalog_metadata[db_name][schemaname]
                upstream_lineage = UpstreamLineageClass(upstreams=[
                    UpstreamClass(
                        mce_builder.make_dataset_urn(
                            self.eskind_to_platform[
                                external_db_params["eskind"]],
                            "{database}.{table}".format(
                                database=external_db_params[
                                    "external_database"],
                                table=tablename,
                            ),
                            self.config.env,
                        ),
                        DatasetLineageTypeClass.COPY,
                    )
                ])
                mcp = MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_urn,
                    aspectName="upstreamLineage",
                    aspect=upstream_lineage,
                )
                return mcp
        return None