Esempio n. 1
0
    def get_lineage_mcp(
            self, dataset_urn: str) -> Optional[MetadataChangeProposalWrapper]:
        """Build the ``upstreamLineage`` change proposal for one dataset.

        Args:
            dataset_urn: URN of the dataset whose upstream lineage is wanted.

        Returns:
            An UPSERT proposal carrying the ``upstreamLineage`` aspect, or
            ``None`` when no lineage metadata is loaded, the URN cannot be
            converted to a dataset key, the table has no lineage entry, or
            its entry is empty.

        Raises:
            ValueError: if the dataset key name does not split into exactly
                three dot-separated parts.
        """
        if self.lineage_metadata is None:
            return None

        key: Optional[DatasetKey] = mce_builder.dataset_urn_to_key(
            dataset_urn)
        if key is None:
            return None

        project_id, dataset_name, tablename = key.name.split(".")
        lookup_name = str(
            BigQueryTableRef(project_id, dataset_name, tablename))
        if lookup_name not in self.lineage_metadata:
            return None

        # Iterate in sorted order so the same set of upstreams always yields
        # the same aspect; otherwise a different ordering could create a new
        # aspect version in the backend.
        upstreams: List[UpstreamClass] = []
        for upstream_name in sorted(self.lineage_metadata[lookup_name]):
            ref = BigQueryTableRef.from_string_name(upstream_name)
            upstream_urn = mce_builder.make_dataset_urn(
                self.platform,
                "{project}.{database}.{table}".format(
                    project=ref.project,
                    database=ref.dataset,
                    table=ref.table,
                ),
                self.config.env,
            )
            upstreams.append(
                UpstreamClass(upstream_urn,
                              DatasetLineageTypeClass.TRANSFORMED))

        if not upstreams:
            return None

        return MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="upstreamLineage",
            aspect=UpstreamLineageClass(upstreams=upstreams),
        )
Esempio n. 2
0
 def get_upstream_tables(
         self,
         bq_table: str,
         tables_seen: Optional[List[str]] = None) -> Set[BigQueryTableRef]:
     """Collect the non-temporary upstream tables of ``bq_table``.

     Temporary upstream tables are not returned themselves; when they have
     their own entry in ``self.lineage_metadata`` they are expanded
     recursively and replaced by their upstreams.

     Args:
         bq_table: Key of the table to look up in ``self.lineage_metadata``.
         tables_seen: Names of temporary tables already visited. The list is
             appended to in place so recursive calls share it. When omitted,
             a fresh list is created for this call.

     Returns:
         The set of non-temporary upstream table references.

     Raises:
         AssertionError: if ``self.lineage_metadata`` is falsy.
         KeyError: if ``bq_table`` has no entry in a dict-like
             ``self.lineage_metadata``.
     """
     assert self.lineage_metadata
     # A mutable default would be shared by every call that omits the
     # argument, so tables seen in one traversal would be skipped in the
     # next. Create the list per call instead.
     if tables_seen is None:
         tables_seen = []

     upstreams: Set[BigQueryTableRef] = set()
     for ref_table in self.lineage_metadata[str(bq_table)]:
         upstream_table = BigQueryTableRef.from_string_name(ref_table)
         if not upstream_table.is_temporary_table():
             upstreams.add(upstream_table)
             continue

         # Never process a temporary table twice, so that cyclic lineage
         # cannot cause infinite recursion.
         if ref_table in tables_seen:
             logger.debug(
                 f"Skipping table {ref_table} because it was seen already"
             )
             continue
         tables_seen.append(ref_table)

         if ref_table in self.lineage_metadata:
             upstreams.update(
                 self.get_upstream_tables(ref_table,
                                          tables_seen=tables_seen))
     return upstreams